#include "MShortImage.h"

// Sets the image dimensions to x_ by y_ and resets the region of interest
// (xMin_/yMin_/xMax_/yMax_) to the full image.
//
// The pixel buffer is reallocated only when the requested pixel count is
// larger than the current x__*y__; the previous contents are not copied into
// the new buffer. x_ must be a multiple of 8 and y_ must be even (asserted).
inline void MShortImage::SetSize(int x_, int y_) {
  // Same dimension constraints as MCharImage, so conversions stay compatible.
  assert((x_ & 7) == 0 && (y_ & 1) == 0);

  const int RequestedCount = x_ * y_;
  const int CurrentCount   = x__ * y__;
  if (RequestedCount > CurrentCount) {
    // A non-zero x__ is the file's marker for "a buffer may have been allocated".
    if (x__ != 0) {
      MAlign16ByteDelete((int*)Map);
    }
    Map = (unsigned short*)MAlign16ByteNew(RequestedCount);
  }

  x__ = x_;
  y__ = y_;

  // The region of interest covers the whole image again.
  xMin_ = 0;
  yMin_ = 0;
  xMax_ = x_;
  yMax_ = y_;
}

// Builds an x_ by y_ image. Starts from an empty state (no buffer) and lets
// SetSize perform the allocation. When WithInit0 is non-zero and the image is
// not empty, every pixel is set to 0; otherwise the pixel values are left
// unset by this constructor.
MShortImage::MShortImage(int x_, int y_, int WithInit0)
  : x__(0), y__(0), Map(NULL)
{
  SetSize(x_, y_);

  if (!WithInit0 || !x__ || !y__)
    return;

  const int PixelCount = x__ * y__;
  for (int Index = PixelCount - 1; Index >= 0; Index--)
    Map[Index] = 0;
}
// Releases the pixel buffer. The buffer is freed only when x__ is non-zero,
// which is the same test SetSize uses before freeing.
MShortImage::~MShortImage() {
  if (x__ == 0)
    return;
  MAlign16ByteDelete((int*)Map);
}



#ifdef ARCH_X86

#else

#endif

// Resizes this image to the dimensions of Image and copies its samples,
// widening each unsigned char value (0..255) to an unsigned short.
//
// Image: source image; its samples are read as x__*y__ consecutive bytes
//        starting at &Image.y_(0,0).
//
// A disabled SSE variant (MsseConvertCharToShort followed by Memms) used to be
// selected here through MWithMmxSseUse(); only the scalar path is active.
void MShortImage::ConvertFrom(const MCharImage& Image) {
  SetSize(Image.x(), Image.y());

  // Index from the real start of both buffers. The previous version biased
  // both pointers by -1 (forming a pointer before the start of the array is
  // undefined behaviour) and went through a 256-entry identity lookup table
  // whose result is the same as a plain widening conversion.
  const unsigned char* In= &Image.y_(0,0);
  unsigned short* Out= Map;
  const int Count= x__*y__;
  for (int i= 0; i< Count; i++)
    Out[i]= In[i];
}

// Copy assignment: takes the size, the region of interest and the pixel data
// of Image. Returns *this.
const MShortImage& MShortImage::operator=(const MShortImage& Image) {
  // Self-assignment must be a no-op: SetSize() resets the region of interest
  // to the full image, so without this guard `a = a` would lose xMin_..yMax_
  // before they are read back from Image (which is *this), and memcpy would be
  // called with identical source and destination ranges.
  if (this == &Image)
    return *this;

  SetSize(Image.x__,Image.y__);
  xMin_= Image.xMin_; yMin_= Image.yMin_;
  xMax_= Image.xMax_; yMax_= Image.yMax_;

  // Nothing to copy for an empty image; this also avoids handing a possibly
  // null buffer to memcpy.
  if (x__ > 0 && y__ > 0)
    memcpy((char*)Map, (char*)Image.Map, sizeof(unsigned short)*x__*y__);
  return *this;
}

// ******************************* 1 2 1 smooth *******************************

#ifdef ARCH_X86
// Set-up step for the vertical 1-2-1 MMX smoothing: loads an all-zero 64-bit
// constant into register mm6.
// NOTE(review): mm6 is not referenced by Mmmx121ySmooth1 as written in this
// file, so this looks like left-over set-up - confirm before removing.
static inline void Mmmx121ySmooth0(void)
{
  static mmx_t mmx_0000=  {0x0000000000000000};
  movq_m2r(mmx_0000,mm6);  // mm6= 00 00 00 00
}
// Vertical 1-2-1 smoothing of four consecutive 16-bit pixels.
//   In      : points at the four pixels of the current row
//   Out     : receives the four smoothed pixels
//   xStride : distance, in pixels, between two consecutive rows; the rows at
//             In-xStride and In+xStride are read, so both must exist
// The result is avg(centre, avg(above, below)) computed with two pavgw
// instructions. pavgw rounds each average up ((a+b+1)>>1), so the value can
// differ from the truncating (a+2b+c)/4 of the scalar code by rounding.
static inline void Mmmx121ySmooth1(const unsigned short* In,unsigned short* Out,int xStride)
{
  movq_m2r(*(In-xStride),mm0);  // mm0= four pixels of the row above
  movq_m2r(*In,mm1);            // mm1= four pixels of the current row
  movq_m2r(*(In+xStride),mm2);  // mm2= four pixels of the row below
  pavgw_r2r(mm0,mm2);           // mm2= avg(above, below)
  pavgw_r2r(mm1,mm2);           // mm2= avg(current, avg(above, below))
  movq_r2m(mm2,*Out);
}

// Vertical 1-2-1 smoothing of an x by y image of 16-bit pixels (MMX version).
//   In, Out : source and destination buffers, rows xStride pixels apart
//   x, y    : image width and height in pixels
// Rows 0 and y-1 are not written because they lack a row above or below.
// Each row is processed in groups of four pixels; the caller is expected to
// execute the emms equivalent afterwards.
static inline void Mmmx121ySmooth(const unsigned short* In, int xStride, int x, int y,
				  unsigned short* Out)
{
  Mmmx121ySmooth0();
  for (int Row = 1; Row < y-1; ++Row) {
    const unsigned short* Src = In + Row*xStride;
    unsigned short* Dst = Out + Row*xStride;
    for (int Col = 0; Col < x; Col += 4, Src += 4, Dst += 4)
      Mmmx121ySmooth1(Src, Dst, xStride);
  }
}

// Set-up step for the horizontal 1-2-1 MMX smoothing: loads an all-zero 64-bit
// constant into register mm6.
// NOTE(review): mm6 is not referenced by Mmmx121xSmooth1/2/3 as written in
// this file, so this looks like left-over set-up - confirm before removing.
static inline void Mmmx121xSmooth0(void)
{
  static mmx_t mmx_0000=  {0x0000000000000000ull};
  movq_m2r(mmx_0000,mm6);  // mm6= 00 00 00 00
}
// Horizontal 1-2-1 smoothing of four consecutive 16-bit pixels located in the
// interior of a row.
//   In  : points at the four pixels i0..i3 (i0 == In[0]); the groups at In-4
//         and In+4 are also read to fetch the outer neighbours j3 and h0
//   Out : receives the four smoothed pixels
// Register contents in the comments are listed from the most significant word
// to the least significant one. The result is avg(centre, avg(left, right))
// computed with pavgw, which rounds each average up.
static inline void Mmmx121xSmooth1(const unsigned short* In, unsigned short* Out)
{
  // Word masks used to keep or drop individual 16-bit lanes.
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};

  movq_m2r(*In,mm0);          // mm0= i3 i2 i1 i0
  movq_r2r(mm0,mm1);          // mm1= i3 i2 i1 i0
  movq_r2r(mm0,mm2);          // mm2= i3 i2 i1 i0
  
  // Build the right-hand neighbours of i3..i0 in mm0.
  pshufw_r2r(mm0,mm0,0x39);   // mm0= i0 i3 i2 i1
  movq_m2r(*(In+4),mm3);      // mm3= h3 h2 h1 h0
  pand_m2r(mmx_0FFF,mm0);     // mm0= 00 i3 i2 i1
  pand_m2r(mmx_000F,mm3);     // mm3= 00 00 00 h0
  pshufw_r2r(mm3,mm3,0x39);   // mm3= h0 00 00 00
  paddusw_r2r(mm3,mm0);       // mm0= h0 i3 i2 i1  mm0 is complete

  // Build the left-hand neighbours of i3..i0 in mm2.
  pshufw_r2r(mm2,mm2,0x93);   // mm2= i2 i1 i0 i3
  pand_m2r(mmx_FFF0,mm2);     // mm2= i2 i1 i0 00
  movq_m2r(*(In-4),mm3);      // mm3= j3 j2 j1 j0
  pand_m2r(mmx_F000,mm3);     // mm3= j3 00 00 00
  pshufw_r2r(mm3,mm3,0x93);   // mm3= 00 00 00 j3
  paddusw_r2r(mm3,mm2);       // mm2= i2 i1 i0 j3  mm2 is complete
  
  pavgw_r2r(mm2,mm0);         // mm0= avg(left, right)
  pavgw_r2r(mm1,mm0);         // mm0= avg(centre, avg(left, right))

  movq_r2m(mm0,*Out);
}
// Horizontal 1-2-1 smoothing of the FIRST four pixels of a row.
//   In  : points at pixels i0..i3 at the start of the row; the group at In+4
//         is read to fetch the right neighbour h0 of i3
//   Out : receives the four smoothed pixels
// Nothing is read before In: the missing left neighbour of i0 is taken as 0.
// Register contents in the comments are listed from the most significant word
// to the least significant one.
static inline void Mmmx121xSmooth2(const unsigned short* In, unsigned short* Out)
{
  // Word masks; mmx_F000 is declared but not used in this variant.
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};

  movq_m2r(*In,mm0);          // mm0= i3 i2 i1 i0
  movq_r2r(mm0,mm1);          // mm1= i3 i2 i1 i0
  movq_r2r(mm0,mm2);          // mm2= i3 i2 i1 i0
  
  // Build the right-hand neighbours of i3..i0 in mm0.
  pshufw_r2r(mm0,mm0,0x39);   // mm0= i0 i3 i2 i1
  movq_m2r(*(In+4),mm3);      // mm3= h3 h2 h1 h0
  pand_m2r(mmx_0FFF,mm0);     // mm0= 00 i3 i2 i1
  pand_m2r(mmx_000F,mm3);     // mm3= 00 00 00 h0
  pshufw_r2r(mm3,mm3,0x39);   // mm3= h0 00 00 00
  paddusw_r2r(mm3,mm0);       // mm0= h0 i3 i2 i1  mm0 is complete

  // Left-hand neighbours: the lane for i0 stays 0 (no pixel before the row).
  pshufw_r2r(mm2,mm2,0x93);   // mm2= i2 i1 i0 i3
  pand_m2r(mmx_FFF0,mm2);     // mm2= i2 i1 i0 00
  
  pavgw_r2r(mm2,mm0);         // mm0= avg(left, right)
  pavgw_r2r(mm1,mm0);         // mm0= avg(centre, avg(left, right))

  movq_r2m(mm0,*Out);
}
// Horizontal 1-2-1 smoothing of the LAST four pixels of a row.
//   In  : points at pixels i0..i3 at the end of the row; the group at In-4
//         is read to fetch the left neighbour j3 of i0
//   Out : receives the four smoothed pixels
// Nothing is read after In+3: the missing right neighbour of i3 is taken as 0.
// Register contents in the comments are listed from the most significant word
// to the least significant one.
static inline void Mmmx121xSmooth3(const unsigned short* In, unsigned short* Out)
{
  // Word masks; mmx_000F is declared but not used in this variant.
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};

  movq_m2r(*In,mm0);          // mm0= i3 i2 i1 i0
  movq_r2r(mm0,mm1);          // mm1= i3 i2 i1 i0
  movq_r2r(mm0,mm2);          // mm2= i3 i2 i1 i0
  
  // Right-hand neighbours: the lane for i3 stays 0 (no pixel after the row).
  pshufw_r2r(mm0,mm0,0x39);   // mm0= i0 i3 i2 i1
  pand_m2r(mmx_0FFF,mm0);     // mm0= 00 i3 i2 i1


  // Build the left-hand neighbours of i3..i0 in mm2.
  pshufw_r2r(mm2,mm2,0x93);   // mm2= i2 i1 i0 i3
  pand_m2r(mmx_FFF0,mm2);     // mm2= i2 i1 i0 00
  movq_m2r(*(In-4),mm3);      // mm3= j3 j2 j1 j0
  pand_m2r(mmx_F000,mm3);     // mm3= j3 00 00 00
  pshufw_r2r(mm3,mm3,0x93);   // mm3= 00 00 00 j3
  paddusw_r2r(mm3,mm2);       // mm2= i2 i1 i0 j3  mm2 is complete
  
  pavgw_r2r(mm2,mm0);         // mm0= avg(left, right)
  pavgw_r2r(mm1,mm0);         // mm0= avg(centre, avg(left, right))

  movq_r2m(mm0,*Out);
}
// Horizontal 1-2-1 smoothing of an x by y image of 16-bit pixels (MMX version).
//   In, Out : source and destination buffers, rows xStride pixels apart
//   x, y    : image width and height in pixels
// Every row is handled in groups of four pixels: a dedicated helper for the
// first group, one for the interior groups and one for the last group, so that
// no pixel outside the row is read. The caller is expected to execute the emms
// equivalent afterwards.
static inline void Mmmx121xSmooth(const unsigned short* In, int xStride, int x, int y,
				  unsigned short* Out)
{
  Mmmx121xSmooth0();
  for (int Row = 0; Row < y; ++Row) {
    const unsigned short* Src = In + Row*xStride;
    unsigned short* Dst = Out + Row*xStride;

    // Filter the first four pixels of the row.
    Mmmx121xSmooth2(Src, Dst);

    // Filter the interior groups.
    int Col = 4;
    for (; Col < x-4; Col += 4)
      Mmmx121xSmooth1(Src + Col, Dst + Col);

    // Filter the last four pixels of the row.
    Mmmx121xSmooth3(Src + Col, Dst + Col);
  }
}

#else 
// Placeholder used when ARCH_X86 is not defined: same signature as the MMX
// vertical smoothing routine, but it does nothing and leaves Out untouched.
static inline void Mmmx121ySmooth(const unsigned short* In, int xStride, int x, int y,
				  unsigned short* Out) {}
// Placeholder used when ARCH_X86 is not defined: same signature as the MMX
// horizontal smoothing routine, but it does nothing and leaves Out untouched.
static inline void Mmmx121xSmooth(const unsigned short* In, int xStride, int x, int y,
				  unsigned short* Out) {}
#endif

// Stores in this image a 1-2-1 smoothed copy of Data along one axis.
//
//   IsHorizontal0Vertical1 : 0 filters along x (each pixel with its left and
//                            right neighbours), non-zero filters along y
//                            (each pixel with the ones above and below).
//   Data                   : source image; this image takes its size.
//
// The region of interest is copied from Data and shrunk by one pixel on both
// sides of the filtered axis. In the vertical pass rows 0 and y__-1 of the
// output are not written. In the horizontal pass a neighbour falling outside
// the row is taken as 0.
//
// When MWithMmxSseUse() is true the MMX helpers are used; they average with
// pavgw, so their output may differ from the scalar (a+2b+c)/4 by rounding.
void MShortImage::Smooth121(int IsHorizontal0Vertical1,
			    const MShortImage& Data) {
  SetSize(Data.x__, Data.y__);
  if (IsHorizontal0Vertical1) {
    xMin_= Data.xMin_;   xMax_= Data.xMax_;
    yMin_= Data.yMin_+1; yMax_= Data.yMax_-1;
    if (MWithMmxSseUse()) {
      Mmmx121ySmooth(Data.Map, x__, x__, y__, Map);
      Memms();
    } else if (y__ >= 2) {
      // Guarded on y__ >= 2: with y__ == 0 the former `yy= y__-2; yy; yy--`
      // loop started at -2 and never reached its end condition.
      for (int xx= 0; xx< x__; xx++) {
	// Walk down one column; a and b hold the two previous input pixels so
	// each input value is read before the output row it affects is written.
	const unsigned short* PteIn= Data.Map+xx;
	unsigned short* PteOut= Map+xx+x__;
	unsigned short a= PteIn[0], b= PteIn[x__];
	PteIn+= (x__+x__);
	for (int yy= y__-2; yy> 0; yy--) {
	  const unsigned short c= *PteIn;
	  *PteOut= (a+c+b+b)/4;
	  PteIn+=  x__;
	  PteOut+= x__;
	  a= b; b= c;
	}
      }
    }
  } else {
    xMin_= Data.xMin_+1; xMax_= Data.xMax_-1;
    yMin_= Data.yMin_;   yMax_= Data.yMax_;
    if (MWithMmxSseUse()) {
      Mmmx121xSmooth(Data.Map, x__, x__, y__, Map);
      Memms();
    } else if (x__ > 0) {
      for (int yy= 0; yy< y__; yy++) {
	const unsigned short* RowIn= Data.Map+yy*x__;
	unsigned short* RowOut= Map+yy*x__;
	// a = left neighbour (0 before the first pixel), b = current pixel.
	unsigned short a= 0, b= RowIn[0];
	int xx= 0;
	for (; xx< x__-1; xx++) {
	  const unsigned short c= RowIn[xx+1];
	  RowOut[xx]= (a+c+b+b)/4;
	  a= b; b= c;
	}
	// Last pixel of the row: the right neighbour is outside the row and is
	// taken as 0, like in the MMX path. The former code read RowIn[x__]
	// here, i.e. the first pixel of the next row, and one element past the
	// end of the buffer on the last row.
	RowOut[xx]= (a+b+b)/4;
      }
    }
  }
}

//------------------------------------------------------------------------------------
// CharProduct
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
// Set-up step for MmmxCharProduct1: loads an all-zero 64-bit constant into
// register mm6, which MmmxCharProduct1 interleaves with the input bytes to
// widen them to 16-bit words.
static inline void MmmxCharProduct0()
{
  static mmx_t mmx_0000=  {0x0000000000000000ull};
  movq_m2r(mmx_0000,mm6);  // mm6= all zero
}

// Multiplies eight unsigned bytes from In1 with the eight matching bytes from
// In2 and stores the eight 16-bit products to Out[0..7].
// Requires mm6 to be zero (see MmmxCharProduct0). pmullw keeps the low 16 bits
// of each product, which is the full result for 8-bit operands
// (255*255 = 65025 fits in 16 bits).
static inline void MmmxCharProduct1(const unsigned char* In1, const unsigned char* In2,
				   unsigned short* Out)
{
  // NOTE(review): mmx_FFFF is not used below, and its value is 0x7FFF per
  // word despite the name.
  static mmx_t mmx_FFFF=  {0x7FFF7FFF7FFF7FFFull};

  movq_m2r(*In1,mm0);        // mm0= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r(mm0,mm1);
  punpckhbw_r2r(mm6,mm0);    // mm0= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm1);    // mm1= 00 i3 00 i2 00 i1 00 i0

  movq_m2r(*In2,mm2);        // mm2= j7 j6 j5 j4 j3 j2 j1 j0
  movq_r2r(mm2,mm3);
  punpckhbw_r2r(mm6,mm2);    // mm2= 00 j7 00 j6 00 j5 00 j4
  punpcklbw_r2r(mm6,mm3);    // mm3= 00 j3 00 j2 00 j1 00 j0

  pmullw_r2r(mm0,mm2);      // mm2= i7*j7 i6*j6 i5*j5 i4*j4
  movq_r2m(mm2,*(Out+4));   // Out is an (unsigned short *), so +4 is an offset of 8 bytes
  pmullw_r2r(mm1,mm3);      // mm3= i3*j3 i2*j2 i1*j1 i0*j0
  movq_r2m(mm3,*(Out));

}

// Element-wise product of two unsigned char buffers into an unsigned short
// buffer (MMX version).
//   In1, In2 : input buffers, read 8 bytes at a time
//   Out      : output buffer
//   size     : number of output ELEMENTS (unsigned shorts), not bytes - it is
//              added to Out to find the end of the output
// Elements are produced in groups of 8, so when size is not a multiple of 8
// the last group extends past Out+size. The caller is expected to execute the
// emms equivalent afterwards.
static inline void MmmxCharProduct(const unsigned char* In1, const unsigned char* In2,
				   unsigned short* Out, int size)
{
  MmmxCharProduct0();
  unsigned short* const End = Out + size;
  while (Out < End) {
    MmmxCharProduct1(In1, In2, Out);
    In1 += 8;
    In2 += 8;
    Out += 8;
  }
}

#else 
// Placeholder used when ARCH_X86 is not defined: same signature as the MMX
// product routine, but it does nothing and leaves Out untouched.
static inline void MmmxCharProduct(const unsigned char* In1, const unsigned char* In2,
				   unsigned short* Out, int size) {}

#endif

// Stores in this image the pixel-wise product of two MCharImage images.
//
//   Image1, Image2 : inputs; they must have identical dimensions (asserted).
//                    Their samples are read as x__*y__ consecutive bytes
//                    starting at &Image.y_(0,0).
//
// This image is resized to the input size. Each product of two 8-bit values
// is at most 255*255 = 65025 and therefore fits an unsigned short.
void MShortImage::CharProduct(const MCharImage& Image1,const MCharImage& Image2)
{
  assert( (Image1.x()==Image2.x()) && (Image1.y()==Image2.y()));

  SetSize(Image1.x(), Image1.y());
  const int Count= x__*y__;
  const unsigned char* In1= &Image1.y_(0,0);
  const unsigned char* In2= &Image2.y_(0,0);
  if (MWithMmxSseUse())
    {
      // MmmxCharProduct takes a count of output elements, not bytes. The
      // former x__*y__*2 made it process twice as many pixels as the images
      // contain.
      MmmxCharProduct(In1, In2, Map, Count);
      Memms();
    }
    else
    {
      // Indices run 0..Count-1. The former loop used S = Count..1 on unbiased
      // pointers, so it wrote one element past the end of Map and never
      // wrote Map[0]. Its identity lookup table is dropped as well.
      for (int i= 0; i< Count; i++)
	Map[i]= (unsigned short)(In1[i]*In2[i]);
    }
}

