#include "Simple2xScaler.hh"
#include "SuperImposedVideoFrame.hh"
#include "LineScalers.hh"
#include "RawFrame.hh"
#include "ScalerOutput.hh"
#include "RenderSettings.hh"
#include "unreachable.hh"
#include "vla.hh"
#include <cassert>
#include <cstdint>
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace openmsx {

// class Simple2xScaler

// Constructs the scaler.
//   pixelOps_       Pixel-format operations; forwarded to the Scaler2 base
//                   and stored in the 'pixelOps' member.
//   renderSettings  Source of the blur and scanline factors that the scale
//                   routines in this file query on every call.
template <class Pixel>
Simple2xScaler<Pixel>::Simple2xScaler(
		const PixelOperations<Pixel>& pixelOps_,
		RenderSettings& renderSettings)
	: Scaler2<Pixel>(pixelOps_),
	  settings(renderSettings),
	  pixelOps(pixelOps_),
	  // The multipliers and the scanline helper are all initialized from
	  // the 'pixelOps' member (not from the constructor argument).
	  mult1(pixelOps),
	  mult2(pixelOps),
	  mult3(pixelOps),
	  scanline(pixelOps)
{
	// Nothing else to do: all state is set up in the initializer list.
}

// Fills output line pairs from source lines that consist of a single color.
//
// For every source line the first output line of the pair is filled with
// the line's color and the second one with that color darkened by the
// current scanline factor.
//
// Unless the destination range ends exactly at the bottom of the output
// surface, the last line pair is not filled here; it is passed on to
// dispatchScale() using the width of the source line that follows it.
template <class Pixel>
void Simple2xScaler<Pixel>::scaleBlank1to2(
		FrameSource& src, unsigned srcStartY, unsigned srcEndY,
		ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
{
	const int darkenFactor = settings.getScanlineFactor();
	const unsigned outHeight = dst.getHeight();

	// Where the plain fill loop stops (exclusive).
	unsigned fillEndY = dstEndY;
	if (dstEndY != outHeight) {
		fillEndY = dstEndY - 2;
	}

	unsigned inY = srcStartY;
	unsigned outY = dstStartY;
	while (outY < fillEndY) {
		const Pixel lineColor = src.getLineColor<Pixel>(inY);
		dst.fillLine(outY + 0, lineColor);
		const Pixel darkColor = scanline.darken(lineColor, darkenFactor);
		dst.fillLine(outY + 1, darkColor);
		inY += 1;
		outY += 2;
	}

	// Reached the bottom of the output: nothing left to hand over.
	if (outY == outHeight) return;

	// The remaining pair borders on a line with a different width; the
	// current line is expected to be a width-1 line, the next one is not.
	const unsigned nextLineWidth = src.getLineWidth(inY + 1);
	assert(src.getLineWidth(inY) == 1);
	assert(nextLineWidth != 1);
	this->dispatchScale(src, inY, srcEndY, nextLineWidth,
	                    dst, outY, dstEndY);
}

#ifdef __SSE2__

// Builds a 128-bit value whose lower 64 bits are the upper 64 bits of 'x'
// and whose upper 64 bits are the lower 64 bits of 'y'.
__m128i shuffle(__m128i x, __m128i y)
{
	// _mm_shuffle_pd() is nominally an instruction on 64-bit floating
	// point lanes, but floats and integers live in the same xmmN
	// registers, so it moves the integer bits exactly as needed here.
	// The casts below do not generate any instructions. Note however
	// that (some?) x86 CPUs keep the float and integer interpretations
	// of these registers in different physical locations, so switching
	// between the interpretations can still cost some overhead there.
	const __m128d xAsPd = _mm_castsi128_pd(x);
	const __m128d yAsPd = _mm_castsi128_pd(y);
	// Selector 1: low lane <- high lane of x, high lane <- low lane of y.
	const __m128d mixed = _mm_shuffle_pd(xAsPd, yAsPd, 1);
	return _mm_castpd_si128(mixed);
}

// 32bpp
// SSE2 implementation of the horizontal "blur while doubling" operation.
//
// Every source pixel 'curr' (with horizontal neighbours 'prev' and 'next')
// produces two output pixels, computed per 8-bit channel:
//     left  = (c1 * prev + c2 * curr) >> 8
//     right = (c1 * next + c2 * curr) >> 8
// The first pixel acts as its own 'prev', the last pixel as its own 'next'.
//
//   in_    Source pixels; must be 16-byte aligned.
//   out_   Receives 2 * width pixels; must be 16-byte aligned.
//   c1_    Weight of the neighbouring pixel.
//   c2_    Weight of the pixel itself. The sums are formed in 16-bit lanes;
//          the caller in this file passes c2_ = 256 - c1_, which keeps
//          them in range.
//   width  Number of source pixels, at least 8. The loop consumes 4 pixels
//          per iteration, so width is expected to be a multiple of 4.
void blur1on2_SSE2(const uint32_t* __restrict in_, uint32_t* __restrict out_,
                   unsigned c1_, unsigned c2_, unsigned long width)
{
	width *= sizeof(uint32_t); // in bytes
	assert(width >= (2 * sizeof(__m128i)));
	// Use uintptr_t rather than long: converting a pointer to an integer
	// type that is too small to hold it (long on LLP64 targets) is
	// ill-formed.
	assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);

	// 'x' is a negative byte offset that counts up towards 0. 'in' and
	// 'out' are biased so that 'in + x' / 'out + 2 * x' address the
	// current group of 4 input / 8 output pixels; with x == 0 they
	// address the last group of the line.
	long x = -long(width - sizeof(__m128i));
	auto* in  = reinterpret_cast<const char*>(in_ ) -     x;
	auto* out = reinterpret_cast<      char*>(out_) - 2 * x;

	// Setup first iteration
	__m128i c1 = _mm_set1_epi16(c1_);
	__m128i c2 = _mm_set1_epi16(c2_);
	__m128i zero = _mm_setzero_si128();

	// Prime the pipeline with the FIRST four pixels of the line. (Loading
	// from the biased 'in' pointer here would fetch the last four pixels
	// and corrupt the first eight output pixels.)
	__m128i abcd = *reinterpret_cast<const __m128i*>(in_);
	__m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
	// 0x44 duplicates the low 64 bits: the very first pixel is used as
	// its own left neighbour.
	__m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
	__m128i d1a1 = _mm_mullo_epi16(c1, d0a0);

	// Each iteration reads 4 pixels and generates 8 pixels
	do {
		// At the start of each iteration these variables are live:
		//   abcd, a0b0, d1a1
		__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
		__m128i b0c0 = shuffle(a0b0, c0d0);
		__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
		__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
		__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
		__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
		__m128i abab = _mm_packus_epi16(daab, abbc);
		// 0xd8 swaps the two middle pixels, which puts the packed
		// results back into left-to-right order.
		*reinterpret_cast<__m128i*>(out + 2 * x) =
			_mm_shuffle_epi32(abab, 0xd8);
		// Fetch the next group; its first pixel is the right neighbour
		// of the current group's last pixel.
		abcd         = *reinterpret_cast<const __m128i*>(in + x + 16);
		a0b0         = _mm_unpacklo_epi8(abcd, zero);
		__m128i d0a0 = shuffle(c0d0, a0b0);
		__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
		d1a1         = _mm_mullo_epi16(c1, d0a0);
		__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
		__m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
		__m128i cdcd = _mm_packus_epi16(bccd, cdda);
		*reinterpret_cast<__m128i*>(out + 2 * x + 16) =
			_mm_shuffle_epi32(cdcd, 0xd8);
		x += 16;
	} while (x < 0);

	// Last iteration (because this doesn't need to read new input)
	__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
	__m128i b0c0 = shuffle(a0b0, c0d0);
	__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
	__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
	__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
	__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
	__m128i abab = _mm_packus_epi16(daab, abbc);
	*reinterpret_cast<__m128i*>(out) = _mm_shuffle_epi32(abab, 0xd8);
	// 0xee duplicates the high 64 bits: the very last pixel is used as
	// its own right neighbour.
	__m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
	__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
	__m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
	__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
	__m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
	__m128i cdcd = _mm_packus_epi16(bccd, cddd);
	*reinterpret_cast<__m128i*>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
}

// no SSE2 16bpp routine yet (probably not worth the effort)
// This overload only exists so that the blur1on2_SSE2() call in blur1on2()
// also compiles when Pixel is a 16-bit type. That call is guarded by
// 'sizeof(Pixel) == 4', so this function is never expected to execute.
void blur1on2_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
                   unsigned /*c1*/, unsigned /*c2*/, unsigned long /*width*/)
{
	UNREACHABLE;
}

#endif

template <class Pixel>
void Simple2xScaler<Pixel>::blur1on2(
	const Pixel* __restrict pIn, Pixel* __restrict pOut,
	unsigned alpha, unsigned long srcWidth)
{
	/* Doubles a line horizontally while blending every output pixel with
	 * its nearest horizontal neighbour. Reads 'srcWidth' pixels from
	 * 'pIn' and writes '2 * srcWidth' pixels to 'pOut'. 'alpha' (0..256)
	 * selects the blur strength; 0 means plain pixel doubling.
	 *
	 * Reference implementation (same results, not optimized):
	 *
	 *         unsigned c1 = alpha / 4;
	 *         unsigned c2 = 256 - c1;
	 *
	 *         Pixel prev, curr, next;
	 *         prev = curr = pIn[0];
	 *
	 *         unsigned x;
	 *         for (x = 0; x < (srcWidth - 1); ++x) {
	 *                 pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
	 *                 next = pIn[x + 1];
	 *                 pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
	 *                 prev = curr;
	 *                 curr = next;
	 *         }
	 *
	 *         pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
	 *         next = curr;
	 *         pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
	 */

	// Without blur this is just a horizontal 1 -> 2 pixel replication.
	if (alpha == 0) {
		Scale_1on2<Pixel> doubler;
		doubler(pIn, pOut, 2 * srcWidth);
		return;
	}

	assert(alpha <= 256);
	const unsigned c1 = alpha / 4; // weight of the neighbouring pixel
	const unsigned c2 = 256 - c1;  // weight of the pixel itself

#ifdef __SSE2__
	// The SSE2 routine is only implemented for 32bpp pixels.
	if (sizeof(Pixel) == 4) {
		blur1on2_SSE2(pIn, pOut, c1, c2, srcWidth);
		return;
	}
#endif
	// Generic C++ path for both 16bpp and 32bpp. Two source pixels (four
	// output pixels) are handled per step and weighted values are reused
	// between neighbouring outputs. Because the tail below always handles
	// exactly two source pixels, this path expects an even srcWidth of
	// at least 2.
	mult1.setFactor32(c1);
	mult2.setFactor32(c2);

	Pixel even = pIn[0];                 // most recent even-indexed pixel
	unsigned evenC1 = mult1.mul32(even); // c1-weighted 'even'
	unsigned oddC1 = evenC1;             // c1-weighted most recent odd pixel;
	                                     // initially stands in for the
	                                     // missing left neighbour

	unsigned x = 0;
	while (x < (srcWidth - 2)) {
		unsigned centre = mult2.mul32(even);
		pOut[2 * x + 0] = mult1.conv32(oddC1 + centre);

		const Pixel odd = pIn[x + 1];
		oddC1 = mult1.mul32(odd);
		pOut[2 * x + 1] = mult1.conv32(oddC1 + centre);

		centre = mult2.mul32(odd);
		pOut[2 * x + 2] = mult1.conv32(evenC1 + centre);

		even = pIn[x + 2];
		evenC1 = mult1.mul32(even);
		pOut[2 * x + 3] = mult1.conv32(evenC1 + centre);

		x += 2;
	}

	// Tail: the last two source pixels.
	unsigned centre = mult2.mul32(even);
	pOut[2 * x + 0] = mult1.conv32(oddC1 + centre);

	const Pixel last = pIn[x + 1];
	oddC1 = mult1.mul32(last);
	pOut[2 * x + 1] = mult1.conv32(oddC1 + centre);

	centre = mult2.mul32(last);
	pOut[2 * x + 2] = mult1.conv32(evenC1 + centre);

	// The last pixel has no right neighbour and is blended with itself,
	// which leaves it unchanged.
	pOut[2 * x + 3] = last;
}

#ifdef __SSE2__

// 32bpp
// SSE2 implementation of the horizontal 3-tap blur at unchanged width.
//
// Every output pixel is computed per 8-bit channel from the source pixel
// 'curr' and its horizontal neighbours 'prev' and 'next':
//     out = (c1 * prev + c2 * curr + c1 * next) >> 8
// The first pixel acts as its own 'prev', the last pixel as its own 'next'.
//
//   in_    Source pixels; must be 16-byte aligned.
//   out_   Receives width pixels; must be 16-byte aligned.
//   c1_    Weight of each neighbouring pixel.
//   c2_    Weight of the pixel itself. The sums are formed in 16-bit lanes;
//          the caller in this file passes c1_ = alpha / 4 and
//          c2_ = 256 - alpha / 2, which keeps them in range.
//   width  Number of source pixels, at least 8. The loop consumes 4 pixels
//          per iteration, so width is expected to be a multiple of 4.
void blur1on1_SSE2(const uint32_t* __restrict in_, uint32_t* __restrict out_,
                   unsigned c1_, unsigned c2_, unsigned long width)
{
	width *= sizeof(uint32_t); // in bytes
	assert(width >= (2 * sizeof(__m128i)));
	// Use uintptr_t rather than long: converting a pointer to an integer
	// type that is too small to hold it (long on LLP64 targets) is
	// ill-formed.
	assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);

	// 'x' is a negative byte offset that counts up towards 0. 'in' and
	// 'out' are biased so that 'in + x' / 'out + x' address the current
	// group of 4 pixels; with x == 0 they address the last group.
	long x = -long(width - sizeof(__m128i));
	auto* in  = reinterpret_cast<const char*>(in_ ) - x;
	auto* out = reinterpret_cast<      char*>(out_) - x;

	// Setup first iteration
	__m128i c1 = _mm_set1_epi16(c1_);
	__m128i c2 = _mm_set1_epi16(c2_);
	__m128i zero = _mm_setzero_si128();

	// Prime the pipeline with the FIRST four pixels of the line. (Loading
	// from the biased 'in' pointer here would fetch the last four pixels
	// and corrupt the first four output pixels.)
	__m128i abcd = *reinterpret_cast<const __m128i*>(in_);
	__m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
	// 0x44 duplicates the low 64 bits: the very first pixel is used as
	// its own left neighbour.
	__m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);

	// Each iteration reads 4 pixels and generates 4 pixels
	do {
		// At the start of each iteration these variables are live:
		//   abcd, a0b0, d0a0
		__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
		__m128i b0c0 = shuffle(a0b0, c0d0);
		__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
		// Both neighbours share the weight c1, so add them first and
		// multiply once.
		__m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
		__m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
		// Fetch the next group; its first pixel is the right neighbour
		// of the current group's last pixel.
		abcd         = *reinterpret_cast<const __m128i*>(in + x + 16);
		a0b0         = _mm_unpacklo_epi8(abcd, zero);
		d0a0         = shuffle(c0d0, a0b0);
		__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
		__m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
		__m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
		*reinterpret_cast<__m128i*>(out + x) =
			_mm_packus_epi16(aabb, ccdd);
		x += 16;
	} while (x < 0);

	// Last iteration (because this doesn't need to read new input)
	__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
	__m128i b0c0 = shuffle(a0b0, c0d0);
	__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
	__m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
	__m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
	// 0xee duplicates the high 64 bits: the very last pixel is used as
	// its own right neighbour.
	__m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
	__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
	__m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
	__m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
	*reinterpret_cast<__m128i*>(out) = _mm_packus_epi16(aabb, ccdd);
}

// no SSE2 16bpp routine yet (probably not worth the effort)
// This overload only exists so that the blur1on1_SSE2() call in blur1on1()
// also compiles when Pixel is a 16-bit type. That call is guarded by
// 'sizeof(Pixel) == 4', so this function is never expected to execute.
void blur1on1_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
                   unsigned /*c1*/, unsigned /*c2*/, unsigned long /*width*/)
{
	UNREACHABLE;
}

#endif
template <class Pixel>
void Simple2xScaler<Pixel>::blur1on1(
	const Pixel* __restrict pIn, Pixel* __restrict pOut,
	unsigned alpha, unsigned long srcWidth)
{
	/* Blurs a line horizontally without changing its width: every output
	 * pixel is a weighted sum of the source pixel and its left and right
	 * neighbours. Reads 'srcWidth' pixels from 'pIn' and writes the same
	 * number to 'pOut'. 'alpha' selects the blur strength; 0 means a
	 * plain copy.
	 *
	 * Reference implementation (same results, not optimized):
	 *
	 *         unsigned c1 = alpha / 4;
	 *         unsigned c2 = 256 - alpha / 2;
	 *
	 *         Pixel prev, curr, next;
	 *         prev = curr = pIn[0];
	 *
	 *         unsigned x;
	 *         for (x = 0; x < (srcWidth - 1); ++x) {
	 *                 next = pIn[x + 1];
	 *                 pOut[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
	 *                 prev = curr;
	 *                 curr = next;
	 *         }
	 *
	 *         next = curr;
	 *         pOut[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
	 */

	// Without blur the line is copied unchanged.
	if (alpha == 0) {
		Scale_1on1<Pixel> copier;
		copier(pIn, pOut, srcWidth);
		return;
	}

	const unsigned c1 = alpha / 4;       // weight of each neighbour
	const unsigned c2 = 256 - alpha / 2; // weight of the pixel itself

#ifdef __SSE2__
	// The SSE2 routine is only implemented for 32bpp pixels.
	if (sizeof(Pixel) == 4) {
		blur1on1_SSE2(pIn, pOut, c1, c2, srcWidth);
		return;
	}
#endif
	// Generic C++ path for both 16bpp and 32bpp. Two pixels are handled
	// per step and the c1-weighted values are carried over so that every
	// source pixel is multiplied by c1 only once. Because the tail below
	// always handles exactly two pixels, this path expects an even
	// srcWidth of at least 2.
	mult1.setFactor32(c1);
	mult3.setFactor32(c2);

	Pixel even = pIn[0];
	// c1-weighted left neighbour of the next even / odd output pixel.
	// The first pixel serves as its own left neighbour.
	unsigned leftOfEven = mult1.mul32(even);
	unsigned leftOfOdd = leftOfEven;

	unsigned x = 0;
	while (x < (srcWidth - 2)) {
		const Pixel odd = pIn[x + 1];
		const unsigned oddC1 = mult1.mul32(odd);
		pOut[x] = mult1.conv32(leftOfEven + mult3.mul32(even) + oddC1);
		leftOfEven = oddC1;

		even = pIn[x + 2];
		const unsigned evenC1 = mult1.mul32(even);
		pOut[x + 1] = mult1.conv32(leftOfOdd + mult3.mul32(odd) + evenC1);
		leftOfOdd = evenC1;

		x += 2;
	}

	// Tail: the last two pixels. The final pixel has no right neighbour,
	// so its own c1-weighted value is used on that side.
	const Pixel last = pIn[x + 1];
	const unsigned lastC1 = mult1.mul32(last);
	pOut[x] = mult1.conv32(leftOfEven + mult3.mul32(even) + lastC1);

	pOut[x + 1] = mult1.conv32(leftOfOdd + mult3.mul32(last) + lastC1);
}

// Produces the output line that sits between two already scaled lines.
//   in1, in2  The scaled lines above and below.
//   out       Receives 'dstWidth' pixels.
//   factor    Scanline factor; the value 255 is special-cased as a plain
//             copy of 'in1' (then 'in2' is not used).
template <class Pixel>
void Simple2xScaler<Pixel>::drawScanline(
		const Pixel* in1, const Pixel* in2, Pixel* out, int factor,
		unsigned dstWidth)
{
	if (factor == 255) {
		// Special case: duplicate the upper line as-is.
		Scale_1on1<Pixel> copier;
		copier(in1, out, dstWidth);
		return;
	}
	scanline.draw(in1, in2, out, factor, dstWidth);
}

// Scales source lines of 'srcWidth' pixels to output lines of
// '2 * srcWidth' pixels, two output lines per source line.
//
// Even output lines (relative to dstStartY) are the horizontally doubled
// and blurred source lines. Each odd output line is generated by
// drawScanline() from the even lines directly above and below it. The line
// below the last odd line lies outside the requested range, so it is
// rendered into a temporary buffer instead of into 'dst'; this means one
// more source line is read than there are output line pairs.
template <class Pixel>
void Simple2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
	unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
	ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
{
	VLA_SSE_ALIGNED(Pixel, srcBuf, srcWidth);
	const int blurFactor = settings.getBlurFactor();
	const int scanFactor = settings.getScanlineFactor();
	const unsigned outWidth = 2 * srcWidth;

	unsigned srcY = srcStartY;
	unsigned outY = dstStartY;

	// First even line.
	auto* inLine = src.getLinePtr(srcY++, srcWidth, srcBuf);
	auto* upper = dst.acquireLine(outY + 0);
	blur1on2(inLine, upper, blurFactor, srcWidth);

	while (outY < dstEndY - 2) {
		// Render the next even line first; the scanline in between
		// needs both of its neighbours.
		inLine = src.getLinePtr(srcY++, srcWidth, srcBuf);
		auto* lower = dst.acquireLine(outY + 2);
		blur1on2(inLine, lower, blurFactor, srcWidth);

		auto* between = dst.acquireLine(outY + 1);
		drawScanline(upper, lower, between, scanFactor, outWidth);

		dst.releaseLine(outY + 0, upper);
		dst.releaseLine(outY + 1, between);
		upper = lower; // stays acquired, becomes the next 'upper' line
		outY += 2;
	}

	// Last pair: the lower neighbour only exists in a scratch buffer.
	inLine = src.getLinePtr(srcY, srcWidth, srcBuf);
	VLA_SSE_ALIGNED(Pixel, scratch, outWidth);
	blur1on2(inLine, scratch, blurFactor, srcWidth);

	auto* between = dst.acquireLine(outY + 1);
	drawScanline(upper, scratch, between, scanFactor, outWidth);
	dst.releaseLine(outY + 0, upper);
	dst.releaseLine(outY + 1, between);
}

// Scales source lines of 'srcWidth' pixels to output lines of the same
// width, two output lines per source line.
//
// Even output lines (relative to dstStartY) are the horizontally blurred
// source lines. Each odd output line is generated by drawScanline() from
// the even lines directly above and below it. The line below the last odd
// line lies outside the requested range, so it is rendered into a
// temporary buffer instead of into 'dst'; this means one more source line
// is read than there are output line pairs.
template <class Pixel>
void Simple2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
	unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
	ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
{
	VLA_SSE_ALIGNED(Pixel, srcBuf, srcWidth);
	const int blurFactor = settings.getBlurFactor();
	const int scanFactor = settings.getScanlineFactor();

	unsigned srcY = srcStartY;
	unsigned outY = dstStartY;

	// First even line.
	auto* inLine = src.getLinePtr(srcY++, srcWidth, srcBuf);
	auto* upper = dst.acquireLine(outY);
	blur1on1(inLine, upper, blurFactor, srcWidth);

	while (outY < dstEndY - 2) {
		// Render the next even line first; the scanline in between
		// needs both of its neighbours.
		inLine = src.getLinePtr(srcY++, srcWidth, srcBuf);
		auto* lower = dst.acquireLine(outY + 2);
		blur1on1(inLine, lower, blurFactor, srcWidth);

		auto* between = dst.acquireLine(outY + 1);
		drawScanline(upper, lower, between, scanFactor, srcWidth);

		dst.releaseLine(outY + 0, upper);
		dst.releaseLine(outY + 1, between);
		upper = lower; // stays acquired, becomes the next 'upper' line
		outY += 2;
	}

	// Last pair: the lower neighbour only exists in a scratch buffer.
	inLine = src.getLinePtr(srcY, srcWidth, srcBuf);
	VLA_SSE_ALIGNED(Pixel, scratch, srcWidth);
	blur1on1(inLine, scratch, blurFactor, srcWidth);

	auto* between = dst.acquireLine(outY + 1);
	drawScanline(upper, scratch, between, scanFactor, srcWidth);
	dst.releaseLine(outY + 0, upper);
	dst.releaseLine(outY + 1, between);
}

// Entry point for scaling a range of lines. When 'superImpose' is non-null
// the source is first combined with that frame and the combined frame is
// scaled; otherwise 'src' is scaled directly.
template <class Pixel>
void Simple2xScaler<Pixel>::scaleImage(
	FrameSource& src, const RawFrame* superImpose,
	unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
	ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
{
	if (!superImpose) {
		this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
		                    dst, dstStartY, dstEndY);
		return;
	}

	// Note: this differs from the openGL implementation. Here the frames
	// are alpha-blended first and scaled afterwards, so the video layer
	// gets blurred as well (and possibly down-scaled to MSX resolution).
	// The openGL version blurs only the MSX frame, then blends it with
	// the video frame and only then applies scanlines. That is probably
	// slightly better visually, but it is much more work to do in
	// software (in openGL shaders it is very easy). This could be
	// improved later, if it is needed at all.
	SuperImposedVideoFrame<Pixel> blended(src, *superImpose, pixelOps);
	// The combined frame reports its own line width, which replaces the
	// width that was passed in.
	const unsigned blendedWidth = blended.getLineWidth(srcStartY);
	this->dispatchScale(blended, srcStartY, srcEndY, blendedWidth,
	                    dst, dstStartY, dstEndY);
}

// Force template instantiation.
// The member functions above are defined in this source file, so the class
// is explicitly instantiated here, once per pixel size that is enabled via
// the HAVE_16BPP / HAVE_32BPP macros (defined outside this file).
#if HAVE_16BPP
template class Simple2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Simple2xScaler<uint32_t>;
#endif

} // namespace openmsx
