#include "MSignedShortImage.h"

// Resizes the image to x_ by y_ pixels and resets the region of interest
// (xMin_/yMin_/xMax_/yMax_) to the whole image.
// The pixel buffer is only reallocated when the requested pixel count is
// larger than the current x__*y__; nothing is copied into the new buffer.
inline void MSignedShortImage::SetSize(int x_, int y_) {
  // Width must be a multiple of 8 and height must be even
  // (original note: same constraint as MCharImage, for conversions).
  assert((x_ & 7) == 0 && (y_ & 1) == 0);

  const bool NeedsLargerBuffer = (x_ * y_ > x__ * y__);
  if (NeedsLargerBuffer) {
    // x__ != 0 is the marker for "a buffer was allocated before".
    if (x__ != 0) {
      MAlign16ByteDelete((int*)Map);
    }
    Map = (signed short*)MAlign16ByteNew(x_ * y_);
  }

  x__ = x_;
  y__ = y_;

  xMin_ = 0;
  yMin_ = 0;
  xMax_ = x_;
  yMax_ = y_;
}

// Builds an x_ by y_ image. When WithInit0 is non-zero and the image is not
// empty, every pixel is set to 0; otherwise the pixel values are left as
// allocated.
MSignedShortImage::MSignedShortImage(int x_, int y_, int WithInit0):
                                                   x__(0), y__(0), Map(NULL)  {
  SetSize(x_,y_);
  if (!WithInit0 || !x__ || !y__)
    return;
  const int PixelCount= x__*y__;
  for (int Index= PixelCount-1; Index>= 0; Index--)
    Map[Index]= 0;
}
// Releases the pixel buffer. A non-zero x__ is used as the marker for
// "a buffer has been allocated".
MSignedShortImage::~MSignedShortImage() {
  if (x__ == 0)
    return;
  MAlign16ByteDelete((int*)Map);
}


// Copies size, region of interest and pixel data from Image.
// Returns a const reference to *this.
const MSignedShortImage& MSignedShortImage::operator=(const MSignedShortImage& Image) {
  // Self-assignment must be a no-op: SetSize() resets the region of interest
  // to the full image, so copying it "from ourselves" afterwards would lose it.
  if (this == &Image)
    return *this;

  SetSize(Image.x__,Image.y__);
  xMin_= Image.xMin_; yMin_= Image.yMin_; 
  xMax_= Image.xMax_; yMax_= Image.yMax_; 

  // Skip the copy for empty images so memcpy is never handed a null buffer.
  const int PixelCount= x__*y__;
  if (PixelCount> 0)
    memcpy((char*)Map, (char*)Image.Map, PixelCount*sizeof(signed short));
  return *this;
}



// Fill the whole image with a constant value
// Sets every one of the x__*y__ pixels to GrayLevel.
void MSignedShortImage::Fill(signed short GrayLevel)
{
	signed short* Cursor= Map;
	signed short* const End= Map+x__*y__;
	while (Cursor<End)
		*Cursor++= GrayLevel;
}

//------------------------------------------------------------------------------------
// ConvertFrom(MCharImage&)
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
// Prepares the MMX state for MmmxConvertFromChar1: clears mm7 (xor with
// itself) so it can serve as the zero operand when bytes are widened to words.
static inline void MmmxConvertFromChar0()
{
	pxor_r2r(mm7,mm7);
}
// Converts the 8 unsigned bytes at In into 8 16-bit values written to
// Out[0..7]. Relies on mm7 being zero (set by MmmxConvertFromChar0).
static inline void MmmxConvertFromChar1(const unsigned char* In,signed short* Out)
{
	movq_m2r(*In,mm0);          // mm0 = the 8 input bytes
	movq_r2r(mm0,mm1);          // mm1 = copy of the 8 input bytes
	punpckhbw_r2r(mm7,mm0);     // mm0 = upper 4 bytes interleaved with the zero bytes of mm7
	punpcklbw_r2r(mm7,mm1);     // mm1 = lower 4 bytes interleaved with the zero bytes of mm7
	movq_r2m(mm1,*Out);         // lower 4 results -> Out[0..3]
	movq_r2m(mm0,*(Out+4));		// upper 4 results -> Out[4..7]
}
#else
// Build without ARCH_X86: there is no MMX state to prepare.
static inline void MmmxConvertFromChar0()
{
}
// Build without ARCH_X86: no-op stand-in for the MMX conversion kernel.
// It must be named MmmxConvertFromChar1 because that is the function
// ConvertFrom() calls; the previous name (MmmxConvertFromChar0) left the
// call unresolved on such builds.
static inline void MmmxConvertFromChar1(const unsigned char* In,signed short* Out)
{
	(void)In;
	(void)Out;
}
#endif
// Resizes this image to the dimensions of Image and copies every pixel,
// widening each unsigned 8-bit sample to a 16-bit value.
// Uses the MMX kernel (8 pixels per step) when MWithMmxSseUse() is non-zero,
// and a plain per-pixel loop otherwise.
void MSignedShortImage::ConvertFrom(const MCharImage& Image)
{
  SetSize(Image.x(),Image.y());
  const int PixelCount=x__*y__;

  if (!MWithMmxSseUse())
  {
    // Scalar path: one pixel at a time.
    const unsigned char* Src=&(Image.y_(0,0));
    signed short* Dst=Map;
    for (int Index=0;Index<PixelCount;Index++)
      Dst[Index]=short(Src[Index]);
    return;
  }

  // MMX path: 8 pixels per kernel call.
  const unsigned char* Src=&(Image.y_(0,0));
  signed short* Dst=Map;
  MmmxConvertFromChar0();
  for (int Done=0;Done<PixelCount;Done+=8,Src+=8,Dst+=8)
    MmmxConvertFromChar1(Src,Dst);
  Memms();
}
//------------------------------------------------------------------------------------
// Substract
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
// In-place subtraction of 4 consecutive 16-bit values: Out[0..3] -= In[0..3],
// using the saturating signed word subtract (psubsw).
static inline void MmmxSubstract1(const signed short* In,signed short* Out)
{
	movq_m2r(*In,mm0);     // mm0 = In[0..3]
	movq_m2r(*Out,mm1);    // mm1 = Out[0..3]
	psubsw_r2r(mm0,mm1);   // mm1 = mm1 - mm0
	movq_r2m(mm1,*Out);    // store the result back to Out[0..3]
}
#else
// Build without ARCH_X86: no-op stand-in so that Substract() still compiles,
// in the same way the other kernels of this file provide empty fallbacks.
static inline void MmmxSubstract1(const signed short* In,signed short* Out)
{
	(void)In;
	(void)Out;
}
#endif
// Subtracts Image from this image pixel by pixel, in place, over the
// x__*y__ pixels of this image. Image is read starting at Image.M_(0,0);
// no size check is made here.
// Uses the MMX kernel (4 pixels per step) when MWithMmxSseUse() is non-zero,
// and a plain per-pixel loop otherwise.
void MSignedShortImage::Substract(const MSignedShortImage& Image)
{
	if (MWithMmxSseUse())
	{
		const signed short* Src=&(Image.M_(0,0));
		signed short* Dst=Map;
		const int PixelCount=x__*y__;
		for (int Done=0;Done<PixelCount;Done+=4,Src+=4,Dst+=4)
			MmmxSubstract1(Src,Dst);
		Memms();
		return;
	}

	const signed short* Src=&(Image.M_(0,0));
	signed short* Dst=Map;
	const int PixelCount=x__*y__;
	for (int Index=0;Index<PixelCount;Index++)
		Dst[Index]=Dst[Index]-Src[Index];
}

//------------------------------------------------------------------------------------
// 1 2 1 smooth 
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
//version MMX 121y
// Loads the constants used by the MMX vertical smoothing kernel:
// mm6 = 0 and mm7 = 1 (mm7 is the shift count passed to psraw in
// Mmmx121ySignedSmooth1).
static inline void Mmmx121ySignedSmooth0(void)
{
  static mmx_t mmx_0000=  {0x0000000000000000ull};
  static mmx_t mmx_0001=  {0x0000000000000001ull};
  movq_m2r(mmx_0000,mm6);
  movq_m2r(mmx_0001,mm7);

}
// Vertical 1-2-1 smoothing of 4 consecutive pixels.
// In points at the current row; the rows above and below are read at
// In-xStride and In+xStride. The 4 results are written to Out.
// Expects mm7 = 1 (see Mmmx121ySignedSmooth0). Each term is halved before
// being added, so the result is built from shifted partial sums.
static inline void Mmmx121ySignedSmooth1(const signed short* In,signed short* Out,int xStride)
{
  movq_m2r(*(In-xStride),mm0); //mm0=a (row above)
  movq_m2r(*In,mm1);           //mm1=b (current row)
  movq_m2r(*(In+xStride),mm2); //mm2=c (row below)
  //Goal: 1/4*(a+2*b+c)
  psraw_r2r(mm7,mm0);   //mm0=a/2
  psraw_r2r(mm7,mm2);   //mm2=c/2
  paddsw_r2r(mm0,mm2);  //mm2=(a+c)/2
  psraw_r2r(mm7,mm2);   //mm2=(a+c)/4
  psraw_r2r(mm7,mm1);   //mm1=b/2
  paddsw_r2r(mm1,mm2);  //result in mm2
  movq_r2m(mm2,*Out);
}
// MMX vertical 1-2-1 smoothing of an x by y image with row pitch xStride.
// Rows 1 .. y-2 of Out are written, 4 pixels per kernel call; the first and
// last rows are left untouched.
static inline void Mmmx121ySignedSmooth(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
  Mmmx121ySignedSmooth0();
  const int LastRow= y-1;
  for (int Row=1; Row<LastRow; Row++)
  {
    const signed short* Src= In+Row*xStride;
    signed short* Dst= Out+Row*xStride;
    for (int Col=0; Col<x; Col+=4)
      Mmmx121ySignedSmooth1(Src+Col,Dst+Col,xStride);
  }
}
//Version SSE2 121y
// SSE2 vertical 1-2-1 smoothing of 8 consecutive pixels.
// In points at the row ABOVE the output row; the centre row is read at
// In+xStride and the row below at In+2*xStride. Results go to Out.
// The loads and the store use movdqa.
static inline void Msse2_121ySignedSmooth1(const signed short* In,signed short* Out,int xStride)
{
  movdqa_m2r(*(In),xmm0); //xmm0=a
  movdqa_m2r(*(In+xStride),xmm1);           //xmm1=b
  movdqa_m2r(*(In+xStride+xStride),xmm2); //xmm2=c
  //Goal: 1/4*(a+2*b+c)
  paddsw_r2r(xmm0,xmm2);	//xmm2=a+c
  psraw_i2r(2,xmm2);			//xmm2=(a+c)/4
  psraw_i2r(1,xmm1);			//xmm1=b/2
  paddsw_r2r(xmm1,xmm2);	//result in xmm2
  movdqa_r2m(xmm2,*Out);
}
// SSE2 vertical 1-2-1 smoothing of an x by y image with row pitch xStride.
// Rows 1 .. y-2 of Out are written, 8 pixels per kernel call. The kernel is
// handed the row above the one being produced.
static inline void Msse2_121ySignedSmooth(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
  const int LastRow= y-1;
  for (int Row=1; Row<LastRow; Row++)
  {
    const signed short* Above= In+(Row-1)*xStride;
    signed short* Dst= Out+Row*xStride;
    for (int Col=0; Col<x; Col+=8)
      Msse2_121ySignedSmooth1(Above+Col,Dst+Col,xStride);
  }
}
// Primes the column-wise SSE2 smoothing: loads 8 pixels of the row at In
// into xmm0 and 8 pixels of the next row (In+xStride) into xmm1.
// Out is not used here.
static inline void Msse2_121ySignedSmoothBis0(const signed short* In,signed short* Out,int xStride)
{
	movdqa_m2r(*In,xmm0);
	movdqa_m2r(*(In+xStride),xmm1);	
}
// One step of the column-wise SSE2 vertical 1-2-1 smoothing (8 pixels).
// On entry xmm0 holds the row above (a) and xmm1 the centre row (b); the row
// below (c) is loaded from In. The result is stored to Out and the registers
// are rotated so the next call can continue one row further down.
// xStride is not used here.
static inline void Msse2_121ySignedSmoothBis1(const signed short* In,signed short* Out,int xStride)
{
	//xmm0=a
	//xmm1=b
  movdqa_m2r(*In,xmm2); //xmm2=c
  //Goal: 1/4*(a+2*b+c)
  paddsw_r2r(xmm2,xmm0);	//xmm0=a+c
  psraw_i2r(2,xmm0);			//xmm0=(a+c)/4
  movdqa_r2r(xmm1,xmm7);	//xmm7=b
  psraw_i2r(1,xmm7);			//xmm7=b/2
  paddsw_r2r(xmm7,xmm0);	//result in xmm0
  movdqa_r2m(xmm0,*Out);
  //set up the registers for the next iteration
  movdqa_r2r(xmm1,xmm0);
  movdqa_r2r(xmm2,xmm1);
}
// Alternative SSE2 vertical 1-2-1 smoothing that walks the image column
// strip by column strip (8 pixels wide), keeping the two previous rows in
// registers. Rows 1 .. y-2 of Out are written.
static inline void Msse2_121ySignedSmoothBis(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
  for (int Col=0; Col<x; Col+=8)
  {
    const signed short* Src= In+Col;
    signed short* Dst= Out+Col;
    // Load rows 0 and 1 of this strip.
    Msse2_121ySignedSmoothBis0(Src,Dst,xStride);
    Src+= xStride+xStride;  // next row to read is row 2
    Dst+= xStride;          // first row to write is row 1
    for (int Row=1; Row<y-1; Row++, Src+=xStride, Dst+=xStride)
      Msse2_121ySignedSmoothBis1(Src,Dst,xStride);
  }
}

//Version MMX 121x
// Loads the constants used by the MMX horizontal smoothing kernels:
// mm6 = 0 and mm7 = 1 (mm7 is the shift count passed to psraw).
static inline void Mmmx121xSignedSmooth0(void)
{
  static mmx_t mmx_0000=  {0x0000000000000000ull};
  static mmx_t mmx_0001=  {0x0000000000000001ull};
  movq_m2r(mmx_0000,mm6);
  movq_m2r(mmx_0001,mm7);
}
// MMX horizontal 1-2-1 smoothing of 4 pixels in the interior of a row.
// Reads the current group at In, the following group at In+4 (for the right
// neighbour of the last pixel) and the preceding group at In-4 (for the left
// neighbour of the first pixel). Results are written to Out.
// Expects mm7 = 1 (see Mmmx121xSignedSmooth0).
// Register pictures below list the four 16-bit lanes from most to least
// significant.
static inline void Mmmx121xSignedSmooth1(const signed short* In, signed short* Out)
{
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};

  movq_m2r(*In,mm0);          // mm0= i3 i2 i1 i0
  movq_r2r(mm0,mm1);          // mm1= i3 i2 i1 i0
  movq_r2r(mm0,mm2);          // mm2= i3 i2 i1 i0
  
  pshufw_r2r(mm0,mm0,0x39);   // mm0= i0 i3 i2 i1
  movq_m2r(*(In+4),mm3);      // mm3= h3 h2 h1 h0
  pand_m2r(mmx_0FFF,mm0);     // mm0= 00 i3 i2 i1
  pand_m2r(mmx_000F,mm3);     // mm3= 00 00 00 h0
  pshufw_r2r(mm3,mm3,0x39);   // mm3= h0 00 00 00
  paddusw_r2r(mm3,mm0);       // mm0= h0 i3 i2 i1  mm0 is ready (right neighbours)

  pshufw_r2r(mm2,mm2,0x93);   // mm2= i2 i1 i0 i3
  pand_m2r(mmx_FFF0,mm2);     // mm2= i2 i1 i0 00
  movq_m2r(*(In-4),mm3);      // mm3= j3 j2 j1 j0
  pand_m2r(mmx_F000,mm3);     // mm3= j3 00 00 00
  pshufw_r2r(mm3,mm3,0x93);   // mm3= 00 00 00 j3
  paddusw_r2r(mm3,mm2);       // mm2= i2 i1 i0 j3  mm2 is ready (left neighbours)
  
  //Goal: 1/4*(mm0+2*mm1+mm2)
  psraw_r2r(mm7,mm0);   //mm0=a/2
  psraw_r2r(mm7,mm2);   //mm2=c/2
  paddsw_r2r(mm0,mm2);  //mm2=(a+c)/2
  psraw_r2r(mm7,mm2);   //mm2=(a+c)/4
  psraw_r2r(mm7,mm1);   //mm1=b/2
  paddsw_r2r(mm1,mm2);  //result in mm2
  movq_r2m(mm2,*Out);  
}
// MMX horizontal 1-2-1 smoothing of the FIRST 4 pixels of a row.
// Same as Mmmx121xSignedSmooth1 except that nothing is read from In-4: the
// left neighbour of the first pixel is taken as 0. The following group at
// In+4 is still read. Expects mm7 = 1 (see Mmmx121xSignedSmooth0).
// Register pictures list the four 16-bit lanes from most to least significant.
static inline void Mmmx121xSignedSmooth2(const signed short* In, signed short* Out)
{
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};

  movq_m2r(*In,mm0);          // mm0= i3 i2 i1 i0
  movq_r2r(mm0,mm1);          // mm1= i3 i2 i1 i0
  movq_r2r(mm0,mm2);          // mm2= i3 i2 i1 i0
  
  pshufw_r2r(mm0,mm0,0x39);   // mm0= i0 i3 i2 i1
  movq_m2r(*(In+4),mm3);      // mm3= h3 h2 h1 h0
  pand_m2r(mmx_0FFF,mm0);     // mm0= 00 i3 i2 i1
  pand_m2r(mmx_000F,mm3);     // mm3= 00 00 00 h0
  pshufw_r2r(mm3,mm3,0x39);   // mm3= h0 00 00 00
  paddusw_r2r(mm3,mm0);       // mm0= h0 i3 i2 i1  mm0 is ready (right neighbours)

  pshufw_r2r(mm2,mm2,0x93);   // mm2= i2 i1 i0 i3
  pand_m2r(mmx_FFF0,mm2);     // mm2= i2 i1 i0 00  (left neighbours, 0 at the row start)

  //Goal: 1/4*(mm0+2*mm1+mm2)
  psraw_r2r(mm7,mm0);   //mm0=a/2
  psraw_r2r(mm7,mm2);   //mm2=c/2
  paddsw_r2r(mm0,mm2);  //mm2=(a+c)/2
  psraw_r2r(mm7,mm2);   //mm2=(a+c)/4
  psraw_r2r(mm7,mm1);   //mm1=b/2
  paddsw_r2r(mm1,mm2);  //result in mm2
  movq_r2m(mm2,*Out);  
}
// MMX horizontal 1-2-1 smoothing of the LAST 4 pixels of a row.
// Same as Mmmx121xSignedSmooth1 except that nothing is read from In+4: the
// right neighbour of the last pixel is taken as 0. The preceding group at
// In-4 is still read. Expects mm7 = 1 (see Mmmx121xSignedSmooth0).
// Register pictures list the four 16-bit lanes from most to least significant.
static inline void Mmmx121xSignedSmooth3(const signed short* In, signed short* Out)
{
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};

  movq_m2r(*In,mm0);          // mm0= i3 i2 i1 i0
  movq_r2r(mm0,mm1);          // mm1= i3 i2 i1 i0
  movq_r2r(mm0,mm2);          // mm2= i3 i2 i1 i0
  
  pshufw_r2r(mm0,mm0,0x39);   // mm0= i0 i3 i2 i1
  pand_m2r(mmx_0FFF,mm0);     // mm0= 00 i3 i2 i1  (right neighbours, 0 at the row end)


  pshufw_r2r(mm2,mm2,0x93);   // mm2= i2 i1 i0 i3
  pand_m2r(mmx_FFF0,mm2);     // mm2= i2 i1 i0 00
  movq_m2r(*(In-4),mm3);      // mm3= j3 j2 j1 j0
  pand_m2r(mmx_F000,mm3);     // mm3= j3 00 00 00
  pshufw_r2r(mm3,mm3,0x93);   // mm3= 00 00 00 j3
  paddusw_r2r(mm3,mm2);       // mm2= i2 i1 i0 j3  mm2 is ready (left neighbours)
  //Goal: 1/4*(mm0+2*mm1+mm2)
  psraw_r2r(mm7,mm0);   //mm0=a/2
  psraw_r2r(mm7,mm2);   //mm2=c/2
  paddsw_r2r(mm0,mm2);  //mm2=(a+c)/2
  psraw_r2r(mm7,mm2);   //mm2=(a+c)/4
  psraw_r2r(mm7,mm1);   //mm1=b/2
  paddsw_r2r(mm1,mm2);  //result in mm2
  movq_r2m(mm2,*Out);
}
// MMX horizontal 1-2-1 smoothing of an x by y image with row pitch xStride.
// Each row is processed in groups of 4 pixels: a dedicated kernel for the
// first group, the generic kernel for the interior groups and a dedicated
// kernel for the last group.
static inline void Mmmx121xSignedSmooth(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
  Mmmx121xSignedSmooth0();
  for (int Row=0; Row<y; Row++)
  {
    const signed short* Src= In+Row*xStride;
    signed short* Dst= Out+Row*xStride;

    // first 4 pixels of the row
    Mmmx121xSignedSmooth2(Src,Dst);

    // interior groups
    int Col= 4;
    for (; Col<x-4; Col+=4)
      Mmmx121xSignedSmooth1(Src+Col,Dst+Col);

    // last 4 pixels of the row
    Mmmx121xSignedSmooth3(Src+Col,Dst+Col);
  }
}
//Version SSE2 121x
// SSE2 horizontal 1-2-1 smoothing of 8 pixels in the interior of a row.
// Reads the current group at In, the preceding group at In-8 and the
// following group at In+8; writes 8 results to Out.
// Register pictures list the eight 16-bit lanes from most to least
// significant.
// NOTE(review): the lane pictures below assume pslldq_i2r/psrldq_i2r wrap the
// x86 PSLLDQ/PSRLDQ whole-register byte shifts (left = towards the most
// significant lane); the earlier comments labelled the lanes differently.
// Please confirm against the macro header.
static inline void Msse2_121xSignedSmooth1(const signed short* In, signed short* Out)
{
  movdqa_m2r(*In,xmm0);          // xmm0= i7 i6 i5 i4 i3 i2 i1 i0
  movdqa_r2r(xmm0,xmm1);         // xmm1= i7 i6 i5 i4 i3 i2 i1 i0
  movdqa_r2r(xmm0,xmm2);         // xmm2= i7 i6 i5 i4 i3 i2 i1 i0

  pslldq_i2r(2,xmm0);            // xmm0= i6 i5 i4 i3 i2 i1 i0 00
  movdqa_m2r(*(In-8),xmm3);      // xmm3= h7 h6 h5 h4 h3 h2 h1 h0 (preceding group)
  psrldq_i2r(14,xmm3);					 // xmm3= 00 00 00 00 00 00 00 h7
  paddusw_r2r(xmm3,xmm0);        // xmm0= i6 i5 i4 i3 i2 i1 i0 h7  left neighbours ready

  psrldq_i2r(2,xmm2);						 // xmm2= 00 i7 i6 i5 i4 i3 i2 i1
  movdqa_m2r(*(In+8),xmm3);      // xmm3= j7 j6 j5 j4 j3 j2 j1 j0 (following group)
  pslldq_i2r(14,xmm3);					 // xmm3= j0 00 00 00 00 00 00 00
  paddusw_r2r(xmm3,xmm2);        // xmm2= j0 i7 i6 i5 i4 i3 i2 i1  right neighbours ready

  //Goal: 1/4*(xmm0+2*xmm1+xmm2)
  paddsw_r2r(xmm0,xmm2);	//xmm2=a+c
  psraw_i2r(2,xmm2);			//xmm2=(a+c)/4
  psraw_i2r(1,xmm1);			//xmm1=b/2
  paddsw_r2r(xmm1,xmm2);	//xmm2=b/2+(a+c)/4    
  movdqa_r2m(xmm2,*Out);  
}
// SSE2 horizontal 1-2-1 smoothing of the FIRST 8 pixels of a row.
// Nothing is read from In-8, so the left neighbour of the first pixel is 0.
// The following group at In+8 is still read.
// Register pictures list the eight 16-bit lanes from most to least
// significant.
// NOTE(review): lane pictures assume pslldq_i2r/psrldq_i2r wrap the x86
// PSLLDQ/PSRLDQ byte shifts; the earlier comments labelled the lanes
// differently. Please confirm against the macro header.
static inline void Msse2_121xSignedSmooth2(const signed short* In, signed short* Out)
{
  movdqa_m2r(*In,xmm0);          // xmm0= i7 i6 i5 i4 i3 i2 i1 i0
  movdqa_r2r(xmm0,xmm1);         // xmm1= i7 i6 i5 i4 i3 i2 i1 i0
  movdqa_r2r(xmm0,xmm2);         // xmm2= i7 i6 i5 i4 i3 i2 i1 i0

  pslldq_i2r(2,xmm0);            // xmm0= i6 i5 i4 i3 i2 i1 i0 00  left neighbours (0 at row start)

  psrldq_i2r(2,xmm2);						 // xmm2= 00 i7 i6 i5 i4 i3 i2 i1
  movdqa_m2r(*(In+8),xmm3);      // xmm3= j7 j6 j5 j4 j3 j2 j1 j0 (following group)
  pslldq_i2r(14,xmm3);					 // xmm3= j0 00 00 00 00 00 00 00
  paddusw_r2r(xmm3,xmm2);        // xmm2= j0 i7 i6 i5 i4 i3 i2 i1  right neighbours ready

  //Goal: 1/4*(xmm0+2*xmm1+xmm2)
  paddsw_r2r(xmm0,xmm2);	//xmm2=a+c
  psraw_i2r(2,xmm2);			//xmm2=(a+c)/4
  psraw_i2r(1,xmm1);			//xmm1=b/2
  paddsw_r2r(xmm1,xmm2);	//xmm2=b/2+(a+c)/4
  movdqa_r2m(xmm2,*Out);
}
// SSE2 horizontal 1-2-1 smoothing of the LAST 8 pixels of a row.
// Nothing is read from In+8, so the right neighbour of the last pixel is 0.
// The preceding group at In-8 is still read.
// Register pictures list the eight 16-bit lanes from most to least
// significant.
// NOTE(review): lane pictures assume pslldq_i2r/psrldq_i2r wrap the x86
// PSLLDQ/PSRLDQ byte shifts; the earlier comments labelled the lanes
// differently. Please confirm against the macro header.
static inline void Msse2_121xSignedSmooth3(const signed short* In, signed short* Out)
{
  movdqa_m2r(*In,xmm0);          // xmm0= i7 i6 i5 i4 i3 i2 i1 i0
  movdqa_r2r(xmm0,xmm1);         // xmm1= i7 i6 i5 i4 i3 i2 i1 i0
  movdqa_r2r(xmm0,xmm2);         // xmm2= i7 i6 i5 i4 i3 i2 i1 i0

  pslldq_i2r(2,xmm0);            // xmm0= i6 i5 i4 i3 i2 i1 i0 00
  movdqa_m2r(*(In-8),xmm3);      // xmm3= h7 h6 h5 h4 h3 h2 h1 h0 (preceding group)
  psrldq_i2r(14,xmm3);					 // xmm3= 00 00 00 00 00 00 00 h7
  paddusw_r2r(xmm3,xmm0);        // xmm0= i6 i5 i4 i3 i2 i1 i0 h7  left neighbours ready

  psrldq_i2r(2,xmm2);						 // xmm2= 00 i7 i6 i5 i4 i3 i2 i1  right neighbours (0 at row end)

  //Goal: 1/4*(xmm0+2*xmm1+xmm2)
  paddsw_r2r(xmm0,xmm2);	//xmm2=a+c
  psraw_i2r(2,xmm2);			//xmm2=(a+c)/4
  psraw_i2r(1,xmm1);			//xmm1=b/2
  paddsw_r2r(xmm1,xmm2);	//xmm2=b/2+(a+c)/4
  movdqa_r2m(xmm2,*Out);
}
// SSE2 horizontal 1-2-1 smoothing of an x by y image with row pitch xStride.
// Each row is processed in groups of 8 pixels: a dedicated kernel for the
// first group, the generic kernel for interior groups and a dedicated kernel
// for the last group. Pixels outside the row count as 0.
//
// Rows of at most 8 pixels are handled by a scalar loop: the first-group
// kernel reads the group at In+8 and the last-group kernel would run at
// offset 8, both of which lie outside a single-group row (and outside the
// buffer on the last row).
static inline void Msse2_121xSignedSmooth(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
  if (x<=8)
  {
    // Scalar path mirroring the kernel arithmetic:
    // saturate(left+right)>>2 plus centre>>1. The final sum cannot leave the
    // 16-bit range, so no second saturation is needed.
    // Right shifts of negative values are assumed arithmetic, as on the x86
    // compilers this ARCH_X86-only code targets.
    for (int yy=0; yy<y ; yy++)
    {
      const signed short* RowIn=In+yy*xStride;
      signed short* RowOut=Out+yy*xStride;
      for (int xx=0; xx<x; xx++)
      {
        const int Left= (xx>0) ? RowIn[xx-1] : 0;
        const int Right= (xx+1<x) ? RowIn[xx+1] : 0;
        int Sum=Left+Right;
        if (Sum>32767) Sum=32767;
        if (Sum<-32768) Sum=-32768;
        RowOut[xx]=(signed short)((Sum>>2)+(RowIn[xx]>>1));
      }
    }
    return;
  }

  signed short* PtOut;
  const signed short* PtIn;
  for (int yy=0; yy<y ; yy++)
  {
    PtOut=Out+yy*xStride;
    PtIn=In+yy*xStride;
    // first 8 pixels of the row
    Msse2_121xSignedSmooth2(PtIn,PtOut);
    PtIn+=8;
    PtOut+=8;
    for (int xx=8;xx<x-8;xx+=8)
    {
      Msse2_121xSignedSmooth1(PtIn,PtOut);
      PtIn+=8;
      PtOut+=8;
    }
    // last 8 pixels of the row
    Msse2_121xSignedSmooth3(PtIn,PtOut);
  }
}

#else 
// Builds without ARCH_X86: empty stand-ins for the four SIMD smoothing
// drivers so that Smooth121() still compiles. They do nothing.
static inline void Mmmx121ySignedSmooth(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
}

static inline void Msse2_121ySignedSmooth(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
}

static inline void Mmmx121xSignedSmooth(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
}

static inline void Msse2_121xSignedSmooth(const signed short* In, int xStride, int x, int y,
				  signed short* Out)
{
}
#endif

// Applies a 1-2-1 smoothing filter to Data and stores the result in this
// image, which is first resized to Data's dimensions.
//   IsHorizontal0Vertical1 == 0 : filter along x; the region of interest is
//                                 Data's, shrunk by one pixel on each side in x.
//   IsHorizontal0Vertical1 != 0 : filter along y; the region of interest is
//                                 Data's, shrunk by one pixel on each side in y.
// The implementation is chosen from MWithMmxSseUse(): 2 = SSE2, 1 = MMX,
// 0 = scalar.
//
// In the scalar paths:
//  - along y, nothing is done unless the image has at least 3 rows (there is
//    no interior row otherwise, and an empty image must not be dereferenced);
//  - along x, the neighbour to the right of the last pixel of a row is taken
//    as 0 instead of being read from beyond the row, matching the left edge
//    and avoiding a read past the end of the buffer on the last row.
void MSignedShortImage::Smooth121(int IsHorizontal0Vertical1,
			    const MSignedShortImage& Data)
{
  SetSize(Data.x__, Data.y__);
  if (IsHorizontal0Vertical1)
  {
    // Smoothing along y
    xMin_= Data.xMin_;   xMax_= Data.xMax_;
    yMin_= Data.yMin_+1; yMax_= Data.yMax_-1; 
    switch (MWithMmxSseUse())
    {
			case 2:
			{
				Msse2_121ySignedSmooth(Data.Map, x__, x__, y__, Map);
          // Optional self-check against the scalar reference
          //#define __TESTSIGNEDSMOOTHY__
          #ifdef __TESTSIGNEDSMOOTHY__
					MSignedShortImage Verif(x__,y__);
					for (int xx= 0; xx< x__; xx++)
					{
						signed short *PteIn= Data.Map+xx, *PteOut= Verif.Map+xx+x__,
						a= PteIn[0], b= PteIn[x__];
						PteIn+= (x__+x__);
						for (int yy= y__-2; yy; yy--)
						{
							signed short c= *PteIn;
							*PteOut= (a+c+b+b)/4;
							PteIn+=  x__;
							PteOut+= x__;
							a= b; b= c;
						};
					};
					cout << "verif" << endl;
					for (int i=0;i<x__;i++)
					{
						for (int j=1;j<y__-1;j++)
						{
							if (abs(float(Map[i+j*x__])-float(Verif.Map[i+j*x__]))>2.0) cout << int(Map[i+j*x__]) << " = " << int(Verif.Map[i+j*x__]) << " (" << i << "," << j << ")   " << flush;
						}
					}
					cout << "finverif" << endl;
				  #endif
				break;
			}
			case 1:
			{
				Mmmx121ySignedSmooth(Data.Map, x__, x__, y__, Map);
				Memms();
				break;
			}
			case 0:
			{
				// At least 3 rows are needed to produce one interior row.
				if (y__> 2)
				{
					for (int xx= 0; xx< x__; xx++)
					{
						// Walk down column xx keeping the two previous rows in a and b.
						signed short *PteIn= Data.Map+xx, *PteOut= Map+xx+x__,
						a= PteIn[0], b= PteIn[x__];
						PteIn+= (x__+x__);
						for (int yy= y__-2; yy> 0; yy--)
						{
							signed short c= *PteIn;
							*PteOut= (a+c+b+b)/4;
							PteIn+=  x__;
							PteOut+= x__;
							a= b; b= c;
						};
					};
				}
				break;
			}
		}
  }
  else
  {
		// Smoothing along x
    xMin_= Data.xMin_+1; xMax_= Data.xMax_-1;
    yMin_= Data.yMin_;   yMax_= Data.yMax_; 
    switch (MWithMmxSseUse())
    {
			case 2:
			{
				Msse2_121xSignedSmooth(Data.Map, x__, x__, y__, Map);
          // Optional self-check against the scalar reference
          //#define __TESTSIGNEDSMOOTHX__
          #ifdef __TESTSIGNEDSMOOTHX__
					MSignedShortImage Verif(x__,y__);
					for (int yy= 0; yy< y__; yy++)
					{
						signed short *PteIn= Data.Map+yy*x__+1, *PteOut= Verif.Map+yy*x__,
						a= 0, b= PteIn[-1];
						for (int xx= 0; xx< x__; xx++)
						{
							signed short c= PteIn[xx];
							PteOut[xx]= (a+c+b+b)/4;
							a= b; b= c;
						};
					};
					cout << "verif" << endl;
					for (int i=1;i<x__-1;i++)
					{
						for (int j=0;j<y__;j++)
						{
							if (abs(float(Map[i+j*x__])-float(Verif.Map[i+j*x__]))>2.0) cout << int(Map[i+j*x__]) << " = " << int(Verif.Map[i+j*x__]) << " (" << i << "," << j << ")" << endl;
						}
					}
					cout << "finverif" << endl;
					#endif
				break;
			}
			case 1:
			{
				Mmmx121xSignedSmooth(Data.Map, x__, x__, y__, Map);
				Memms();
				break;
			}
			case 0:
			{
				if (x__> 0)
				{
					for (int yy= 0; yy< y__; yy++)
					{
						// PteIn is offset by one so that PteIn[xx] is the right
						// neighbour of pixel xx; a/b hold the left neighbour and
						// the pixel itself.
						signed short *PteIn= Data.Map+yy*x__+1, *PteOut= Map+yy*x__,
						a= 0, b= PteIn[-1];
						for (int xx= 0; xx< x__; xx++)
						{
							// No pixel exists to the right of the last column: use 0.
							signed short c= (xx+1< x__) ? PteIn[xx] : (signed short)0;
							PteOut[xx]= (a+c+b+b)/4;
							a= b; b= c;
						};
					};
				}
				break;
			};
		}// end switch
		
	}
}




//------------------------------------------------------------------------------------
// CharProduct
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
//Version MMX
// Loads the constant registers for the MMX product kernel:
// mm6 = 0, mm7 = 0x007F in every 16-bit lane, mm5 = 0x8000 in every lane.
// Only mm6 is referenced by MmmxSignedCharProduct1 as written in this file.
static inline void MmmxSignedCharProduct0()
{
  static mmx_t mmx_0000=  {0x0000000000000000ull};
  static mmx_t mmx_mask=  {0x007F007F007F007Full};//{0x807F807F807F807F};
  static mmx_t mmx_msksgn={0x8000800080008000ull};
  movq_m2r(mmx_0000,mm6);
  movq_m2r(mmx_mask,mm7);
  movq_m2r(mmx_msksgn,mm5);
}

// MMX kernel: for 8 pixels, computes (In1[i]-127)*(In2[i]-127) and stores
// the 8 16-bit products at Out[0..7].
// The bytes are widened to 16-bit words by interleaving with mm6 (expected
// to be 0, see MmmxSignedCharProduct0) before 127 is subtracted.
// Note: the constant is named mmx_128w but holds 127 in each lane.
static inline void MmmxSignedCharProduct1(const signed char* In1, const signed char* In2,
				   signed short* Out)
{
  static mmx_t mmx_128w={0x007F007F007F007Full};

  //first pack of 8 bytes
  movq_m2r(*In1,mm0);        // mm0= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r(mm0,mm1);
  //Convert the bytes to signed short
  //and subtract 127
  punpckhbw_r2r(mm6,mm0);    // mm0= 00 i7 00 i6 00 i5 00 i4
  psubsw_m2r(mmx_128w,mm0);   // mm0= 00i7 00i6 00i5 00i4 (signed short values centred on 0)  
  punpcklbw_r2r(mm6,mm1);    // mm1= 00 i3 00 i2 00 i1 00 i0
  psubsw_m2r(mmx_128w,mm1);   // mm1= 00i3 00i2 00i1 00i0 (signed short values centred on 0)
  
  //second pack of 8 bytes
  movq_m2r(*In2,mm2);        // mm2= j7 j6 j5 j4 j3 j2 j1 j0
  movq_r2r(mm2,mm3);
  punpckhbw_r2r(mm6,mm2);    // mm2= 00 j7 00 j6 00 j5 00 j4
  psubsw_m2r(mmx_128w,mm2);  // mm2= 00j7 00j6 00j5 00j4 (signed short values centred on 0)
  punpcklbw_r2r(mm6,mm3);    // mm3= 00 j3 00 j2 00 j1 00 j0
  psubsw_m2r(mmx_128w,mm3);  // mm3= 00j3 00j2 00j1 00j0 (signed short values centred on 0)

  //multiply, keeping the low 16 bits of each product
  // (original note: this is enough because both factors lie between -127 and 127)
  pmullw_r2r(mm0,mm2);      // mm2= i7*j7 i6*j6 i5*j5 i4*j4 (low 16 bits only)
  movq_r2m(mm2,*(Out+4));   // Out is a (signed short *), so +4 is an 8-byte offset

  //same for the second pair of registers
  pmullw_r2r(mm1,mm3);
  movq_r2m(mm3,*(Out));

}

// MMX driver: fills Out[0..size-1] with the per-pixel products of the two
// byte images, 8 pixels per kernel call.
static inline void MmmxSignedCharProduct(const signed char* In1, const signed char* In2,
				   signed short* Out, int size)
{
  signed short* const End=Out+size;
  MmmxSignedCharProduct0();
  signed short* Dst=Out;
  while (Dst<End)
  {
    MmmxSignedCharProduct1(In1,In2,Dst);
    In1+=8;
    In2+=8;
    Dst+=8;
  }
}

//Version SSE2
// Loads the constant registers for the SSE2 product kernel:
// xmm6 = 0 and xmm7 = the 128-bit value found at sse127 (the caller fills it
// with 127 in each 16-bit lane). sse127 is read with movdqa.
static inline void Msse2_SignedCharProduct0(signed short* sse127)
{
  static sse_t sse_0000=  {0x00000000000000000000000000000000ull};
  movdqa_m2r(sse_0000,xmm6);
  movdqa_m2r(*sse127,xmm7);
}

// SSE2 kernel: for 16 pixels, computes (In1[i]-xmm7)*(In2[i]-xmm7) per lane
// and stores the 16 16-bit products at Out[0..15].
// Expects xmm6 = 0 and xmm7 = the per-lane offset (127 as set up by the
// driver through Msse2_SignedCharProduct0). Loads and stores use movdqa.
static inline void Msse2_SignedCharProduct1(const signed char* In1, const signed char* In2,
				   signed short* Out)
{

  //first pack of 16 bytes of the MCharImage
  //Convert the bytes to signed short
  //and subtract 127
  movdqa_m2r(*In1,xmm0);       // xmm0= i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0
  movdqa_r2r(xmm0,xmm1);
  punpckhbw_r2r(xmm6,xmm0);    // xmm0= 00 i15 00 i14 00 i13 00 i12 00 i11 00 i10 00 i9 00 i8
  psubsw_r2r(xmm7,xmm0);   		 // xmm0= 00i15 00i14 00i13 00i12 00i11 00i10 00i9 00i8 (signed short values centred on 0)
  punpcklbw_r2r(xmm6,xmm1);    // xmm1= 00 i7 00 i6 00 i5 00 i4 00 i3 00 i2 00 i1 00 i0
  psubsw_r2r(xmm7,xmm1);   		 // xmm1= 00i7 00i6 00i5 00i4 00i3 00i2 00i1 00i0 (signed short values centred on 0)

  //second pack of 16 bytes
  movdqa_m2r(*In2,xmm2);
  movdqa_r2r(xmm2,xmm3);
  punpckhbw_r2r(xmm6,xmm2);
  psubsw_r2r(xmm7,xmm2);       // xmm2= 00j15 00j14 00j13 00j12 00j11 00j10 00j9 00j8 (signed short values centred on 0)
  punpcklbw_r2r(xmm6,xmm3);
  psubsw_r2r(xmm7,xmm3);       // xmm3= 00j7 00j6 00j5 00j4 00j3 00j2 00j1 00j0 (signed short values centred on 0)

  //multiply, keeping the low 16 bits of each product
  // (original note: this is enough because both factors lie between -127 and 127)
  pmullw_r2r(xmm0,xmm2);      // xmm2= i15*j15 i14*j14 .... (low 16 bits only)
  movdqa_r2m(xmm2,*(Out+8));   // Out is a (signed short *), so +8 is a 16-byte offset

  //same for the second pair of registers
  pmullw_r2r(xmm1,xmm3);
  movdqa_r2m(xmm3,*(Out));  
}

// SSE2 driver: fills Out[0..size-1] with the per-pixel products of the two
// byte images, 16 pixels per kernel call.
// A small aligned scratch buffer holds the constant 127 in 8 16-bit lanes,
// which is what the single 128-bit load in Msse2_SignedCharProduct0 consumes.
static inline void Msse2_SignedCharProduct(const signed char* In1, const signed char* In2,
				   signed short* Out, int size)
{
  // Only 8 shorts (one 128-bit register) are ever read, so only 8 are
  // written. The previous code wrote 16 elements into a buffer requested
  // with a count of 8.
  // NOTE(review): the unit of MAlign16ByteNew's argument is not visible
  // here; confirm that a count of 8 covers at least 16 bytes.
  signed short* sse127=(signed short*) MAlign16ByteNew(8);
  for (int i=0;i<8;i++)
    sse127[i]=127;

  signed short* PtOut;
  signed short* fin=Out+size;
  Msse2_SignedCharProduct0(sse127);
  for (PtOut=Out; PtOut<fin ; PtOut+=16)
  {
    Msse2_SignedCharProduct1(In1,In2,PtOut);
    In1+=16;
    In2+=16;
  }
  MAlign16ByteDelete((int *)(sse127));  
}

#else 
// Builds without ARCH_X86: empty stand-ins for the SIMD product drivers so
// that CharProduct() still compiles. They do nothing.
static inline void MmmxSignedCharProduct(const signed char* In1, const signed char* In2,
				   signed short* Out, int size)
{
}

static inline void Msse2_SignedCharProduct(const signed char* In1, const signed char* In2,
				   signed short* Out, int size)
{
}
#endif

// Stores in this image, for every pixel, the product
// (Image1 - 127) * (Image2 - 127), where the inputs are read as unsigned
// 8-bit samples. This image is resized to the inputs' dimensions, which must
// be identical (asserted).
// The implementation is chosen from MWithMmxSseUse(): 2 = SSE2, 1 = MMX,
// anything else = scalar.
void MSignedShortImage::CharProduct(const MCharImage& Image1,const MCharImage& Image2)
{
  assert( (Image1.x()==Image2.x()) && (Image1.y()==Image2.y()));

  SetSize(Image1.x(), Image1.y());

  const int PixelCount= x__*y__;
  switch (MWithMmxSseUse())
  {
    case 2:
    {
      Msse2_SignedCharProduct((const signed char*)Image1.GetPtr(),
			      (const signed char*)Image2.GetPtr(), Map, PixelCount);
      break;
    }
    case 1:
    {
      MmmxSignedCharProduct((const signed char*)Image1.GetPtr(),
			    (const signed char*)Image2.GetPtr(), Map, PixelCount);
      Memms();
      break;
    }
    default:
    {
      signed short *Out= Map;
      const signed char* In1= (const signed char*) (Image1.GetPtr());
      const signed char* In2= (const signed char*) (Image2.GetPtr());
      // Valid indices are 0 .. PixelCount-1. The previous loop started at
      // PixelCount and so touched one element past the end of all three
      // buffers.
      for (int S= PixelCount-1; S>= 0; S--)
      {
        signed short a=((signed short) ((unsigned char)(In1[S])))-127;
        signed short b=((signed short) ((unsigned char)(In2[S])))-127;
        Out[S]= a * b;
      }
      break;
    }
  }
}


//----------------------------------------------------------------------------------------------------
// 5x5 averaging filter applied to an (unsigned) MCharImage, with conversion to signed short on the fly
//----------------------------------------------------------------------------------------------------
#ifdef ARCH_X86

#else
#endif
// Horizontal pass of the 5x5 sum: for every row, stores at column x
// (2 <= x < x__-2) the sum of the 5 bytes of Data at columns x-2 .. x+2.
// Columns 0, 1, x__-2 and x__-1 of this image are not written.
// This image is not resized here; x__ and y__ are used as they are.
void MSignedShortImage::Somme5XChar2Short(MCharImage& Data)
{
	for (int Row=0;Row<y__;Row++)
	{
		unsigned char* Window=&(Data.y(0,Row));
		signed short* Dst=&(Map[2+Row*x__]);
		for (int Col=2;Col<x__-2;Col++,Window++,Dst++)
		{
			int Sum=0;
			for (int Tap=0;Tap<5;Tap++)
				Sum+=short(Window[Tap]);
			*Dst=Sum;
		}
	}
}
// Vertical pass of the 5x5 sum: for every row y with 2 <= y < y__-2 and every
// column, stores the sum of the 5 values of Data at rows y-2 .. y+2.
// Rows 0, 1, y__-2 and y__-1 of this image are not written.
// Data is indexed with this image's x__ as row pitch.
void MSignedShortImage::Somme5Y(MSignedShortImage& Data)
{
	const int Pitch=x__;
	for (int Row=2;Row<y__-2;Row++)
	{
		signed short* Top=&(Data.Map[(Row-2)*Pitch]);
		signed short* Dst=&(Map[Row*Pitch]);
		for (int Col=0;Col<Pitch;Col++,Top++,Dst++)
		{
			int Sum=0;
			for (int Tap=0;Tap<5;Tap++)
				Sum+=short(Top[Tap*Pitch]);
			*Dst=Sum;
		}
	}
}

// Divides every pixel by denom, in place (integer division).
// denom is not checked; it must not be 0.
void MSignedShortImage::Divide(short denom)
{
	for (int Row=0;Row<y__;Row++)
	{
		signed short* const RowStart=&(Map[Row*x__]);
		signed short* const RowEnd=RowStart+x__;
		for (signed short* Pixel=RowStart;Pixel<RowEnd;Pixel++)
			*Pixel=*Pixel/denom;
	}
}

// 5x5 mean filter of Data, stored in this image as 16-bit values:
// horizontal 5-tap sum into a temporary, vertical 5-tap sum into this image,
// then division of every pixel by 25.
// Rows 0, 1, y__-2 and y__-1 are not written by the vertical pass; they are
// only divided by 25.
void MSignedShortImage::Moyenne5x5Char2Short(MCharImage& Data)
{
	// The two passes index both images with this image's dimensions, so this
	// image must have Data's size. Resize only on mismatch so that an
	// already-matching image keeps its current state.
	if (x__!=Data.x() || y__!=Data.y())
		SetSize(Data.x(),Data.y());

	// Zero-initialise the temporary: the horizontal pass leaves the two
	// border columns on each side unwritten, yet the vertical pass reads
	// every column.
	MSignedShortImage ImgSommeX(Data.x(),Data.y(),1);
	ImgSommeX.Somme5XChar2Short(Data);
	this->Somme5Y(ImgSommeX);
	this->Divide(25);
}


