// ****************************************************************************
// ******************************* MFloatImage ********************************
// ****************************************************************************

#include "MFloatImage.h"

inline void MFloatImage::SetSize(int x_, int y_) {
  assert((x_&7)== 0 && (y_&1)== 0); // like MCharImage for conversions
  // assert((x_&3)== 0);
  if (x_*y_> x__*y__) {
    if (x__)
      MAlign16ByteDelete((int*)Map); // delete yuv;
    Map= (float*)MAlign16ByteNew(x_*y_);
  };
  x__= x_;
  y__= y_;
  xMin_=     yMin_= 0;
  xMax_= x_; yMax_= y_;
}

MFloatImage::MFloatImage(int x_, int y_, int WithInit0):
                                                   x__(0), y__(0), Map(NULL)  {
  SetSize(x_,y_);
  if (WithInit0 && x__ && y__)
    for (int S= x__*y__; S; Map[--S]= 0);
}
MFloatImage::~MFloatImage() {
  if (x__)
    MAlign16ByteDelete((int*)Map);
}


inline void MFloatImage::Bound(int&xMin, int&xMax, int&yMin, int&yMax) const {
  xMin= xMin_; xMax= xMax_; yMin= yMin_; yMax= yMax_;
}

ostream& operator<<(ostream& Out, const MFloatImage& Image) {
  int xMin, xMax, yMin, yMax;
  Image.Bound(xMin, xMax, yMin, yMax);
  Out << "MFloatImage " << Image.x__ << "x" << Image.y__
      <<"  (" <<xMin <<"x" <<yMin <<"  " <<xMax <<"x" <<yMax <<")" <<endl;
  for (int yy= 0; yy< Image.y__; yy++) {
    for (int xx= 0; xx< Image.x__; xx++)
      Out << Image.M_(xx,yy) << " ";
    Out << endl << endl;
  };
  Out << endl;
  return Out;
}

//------------------------------------------------------------------------------------
// ConvertFrom(MCharImage&)
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
static inline void MsseConvertCharToFloat(const uint8_t* In, float* Out) {
  movq_m2r      (*In,mm0);     // mm0= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r      (mm0,mm1);     // mm1= mm0
  pxor_r2r      (mm3,mm3);     // mm3= 0

  punpcklbw_r2r (mm3,mm0);     // mm0= 00 i3 00 i2 00 i1 00 i0
  movq_r2r      (mm0,mm2);     // mm2= mm0
  punpcklwd_r2r (mm3,mm0);     // mm0= 00 00 00 i1 00 00 00 i0
  cvtpi2ps_r2r  (mm0,xmm0);    // xmm0= 0 0 float4(i1) float4(i0)
  punpckhwd_r2r (mm3,mm2);     // mm2= 00 00 00 i3 00 00 00 i2
  cvtpi2ps_r2r  (mm2,xmm1);    // xmm1= 0 0 float4(i3) float4(i2)
  shufps_r2r    (xmm1,xmm0,0x44); // xmm0=float(i3)float(i2)float(i1)float(i0)
  movntps_r2m   (xmm0,*Out);      // no cache polution and 16 bytes alignment
  // movups_r2m    (xmm0, *Out);  // store xmm0 allowing non 16-byte alignment

  punpckhbw_r2r (mm3,mm1);     // mm1= 00 i7 00 i6 00 i5 00 i4
  movq_r2r      (mm1,mm2);     // mm2= mm1
  punpcklwd_r2r (mm3,mm1);     // mm1= 00 00 00 i5 00 00 00 i4
  cvtpi2ps_r2r  (mm1,xmm0);    // xmm0= 0 0 float4(i5) float4(i4)
  punpckhwd_r2r (mm3,mm2);     // mm2= 00 00 00 i7 00 00 00 i6
  cvtpi2ps_r2r  (mm2,xmm1);    // xmm1= 0 0 float4(i7) float4(i6)
  shufps_r2r    (xmm1,xmm0,0x44);  // xmm0=float(i7)float(i6)float(i5)float(i4)
  movntps_r2m   (xmm0,*(Out+4));   // no cache polution and 16 bytes alignment
//movups_r2m    (xmm0, *(Out+4));  // store xmm0 allowing non 16-byte alignment
}
static inline void MsseConvertCharToFloat(const uint8_t* In, int Length,
					  float* Out) {
  for (int i= Length/8; i; i--) {
    MsseConvertCharToFloat(In, Out);
    Out+= 8;
    In+= 8;
  };
}
#else
static inline void MsseConvertCharToFloat(const uint8_t* In, int Length,
					  float* Out) {}
#endif

void MFloatImage::ConvertFrom(const MCharImage& Image) {
  SetSize(Image.x(), Image.y());
  if (MWithMmxSseUse()) {
    MsseConvertCharToFloat(&Image.y_(0,0), x__*y__, Map);
    Memms(); // now FPU can be used
  } else {
    float Lookup[256], *Out= Map-1;
    for (int i= 0; i< 256; i++)
      Lookup[i]= float(i); // conversion is very slow, so use lookup table
    const unsigned char* In= &Image.y_(0,0)-1;
    for (int S= x__*y__; S; S--)
      Out[S]= Lookup[In[S]];
  };
}

//------------------------------------------------------------------------------------
// ConvertFrom(MSignedShortImage&)
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
static inline void MsseConvertSignedShortToFloat1(const signed short* In, float* Out)
{
  movq_m2r      (*In,mm0);     // mm0= iii3 iii2 iii1 iii0
  movq_r2r      (mm0,mm1);     // mm1= mm0

  punpcklwd_r2r(mm0,mm0);     // mm0= iii1xxxx iii0xxxx
  psrad_i2r(16,mm0);          // extension du signe
  cvtpi2ps_r2r(mm0,xmm0);     // xmm0= 0 0 float4(i1) float4(i0)

  punpckhwd_r2r(mm1,mm1);     // mm1= iii3xxxx iii2xxxx
  psrad_i2r(16,mm1);          // extension du signe
  cvtpi2ps_r2r(mm1,xmm1);     // xmm1= 0 0 float4(i3) float4(i2)

  shufps_r2r    (xmm1,xmm0,0x44); // xmm0=float(i3)float(i2)float(i1)float(i0)
  movaps_r2m   (xmm0,*Out);        
}
#else
static inline void MsseConvertSignedShortToFloat1(const signed short* In, float* Out){}
#endif
void MFloatImage::ConvertFrom(const MSignedShortImage& Image)
{
  SetSize(Image.x(), Image.y());
  if (MWithMmxSseUse())
  {
    const signed short* In= &(Image.M_(0,0));
    float* Out= Map;
    int fin=x__*y__;
    for (int i=0;i<fin; i+=4)
    {
      MsseConvertSignedShortToFloat1(In,Out);
      In+=4;
      Out+=4;
    }
    Memms(); // now FPU can be used
  }
  else
  {
    const signed short* In= &(Image.M_(0,0));
    float* Out= Map;
    int fin=x__*y__;
    for (int i=0;i<fin; i++)
    {
      *Out= float(*In);
      In++;
      Out++;
    }
  };
}

//------------------------------------------------------------------------------------
// Product
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
static inline void MsseProduct(const float* In, float* Out)
{
	movaps_m2r(*In,xmm0);
	movaps_m2r(*Out,xmm1);
	mulps_r2r(xmm0,xmm1);
	movaps_r2m(xmm1,*Out);
}
#else
static inline void MsseProduct(const float* In, float* Out){}
#endif
void MFloatImage::Product(const MFloatImage& Image)
{
  if (MWithMmxSseUse())
  {
    const float* In= &(Image.M_(0,0));
    float* Out= Map;
    int fin=x__*y__;
    for (int i=0;i<fin; i+=4)
    {
      MsseProduct(In,Out);
      In+=4;
      Out+=4;
    }
  }
  else
  {
    const float* In= &(Image.M_(0,0));
    float* Out= Map;
    int fin=x__*y__;
    for (int i=0;i<fin; i++)
    {
      *Out= (*Out)*(*In);
      In++;
      Out++;
    }
  };
}
//------------------------------------------------------------------------------------
// Divide
//------------------------------------------------------------------------------------
#ifdef ARCH_X86
static inline void MsseDivide(const float* In, float* Out)
{
	movaps_m2r(*In,xmm0);
	movaps_m2r(*Out,xmm1);
	divps_r2r(xmm0,xmm1);
	movaps_r2m(xmm1,*Out);
}
#else
static inline void MsseProduct(const float* In, float* Out){}
#endif
void MFloatImage::Divide(const MFloatImage& Image)
{
  if (MWithMmxSseUse())
  {
    const float* In= &(Image.M_(0,0));
    float* Out= Map;
    int fin=x__*y__;
    for (int i=0;i<fin; i+=4)
    {
      MsseDivide(In,Out);
      In+=4;
      Out+=4;
    }
  }
  else
  {
    const float* In= &(Image.M_(0,0));
    float* Out= Map;
    int fin=x__*y__;
    for (int i=0;i<fin; i++)
    {
      *Out= (*Out)/(*In);
      In++;
      Out++;
    }
  };
}
//---------------------------------------------------------------------------------------------------
// SqrtImg
//---------------------------------------------------------------------------------------------------
#ifdef ARCH_X86
static inline void MsseSqrtImg(const float* In, float* Out)
{
	movaps_m2r(*In,xmm0);
	sqrtps_r2r(xmm0,xmm0);
	movaps_r2m(xmm0,*Out);
}
#else
static inline void MsseSqrtImg(const float* In, float* Out){}
#endif
void MFloatImage::SqrtImg(const MFloatImage& Image)
{
  SetSize(Image.x(), Image.y());
  if (MWithMmxSseUse())
  {
    const float* In= &(Image.M_(0,0));
    float* Out= Map;
    int fin=x__*y__;
    for (int i=0;i<fin; i+=4)
    {
      MsseSqrtImg(In,Out);
      In+=4;
      Out+=4;
    }
  }
  else
  {
    const float* In= &(Image.M_(0,0));
    float* Out= Map;
    int fin=x__*y__;
    for (int i=0;i<fin; i++)
    {
      *Out= sqrt(*In);
      In++;
      Out++;
    }
  };
}

//------------------------------------------------------------------------------------
// operator=
//------------------------------------------------------------------------------------
const MFloatImage& MFloatImage::operator=(const MFloatImage& Image) {
  SetSize(Image.x__,Image.y__);
  xMin_= Image.xMin_; yMin_= Image.yMin_;
  xMax_= Image.xMax_; yMax_= Image.yMax_;
  memcpy((char*)Map, (char*)Image.Map, x__*y__*4);
  return *this;
}

// ******************************* 1 2 1 smooth *******************************

#ifdef ARCH_X86
static inline void Msse121ySmooth_1(const float* In, int xStride) {
  static sse_t sse_25 = { 0.25, 0.25, 0.25, 0.25 };
  movaps_m2r (sse_25,xmm0);
  movaps_m2r (*In,xmm1);
  movaps_m2r (*(In+xStride),xmm2);
}
static inline void Msse121ySmooth_2(const float* In, float* Out) {
  movaps_m2r (*In,xmm3);
  addps_r2r  (xmm2,xmm1);
  addps_r2r  (xmm2,xmm1);
  addps_r2r  (xmm3,xmm1);
  mulps_r2r  (xmm0,xmm1);
  movntps_r2m (xmm1,*Out);

  movaps_r2r (xmm2,xmm1);
  movaps_r2r (xmm3,xmm2);
}
static inline void Msse121ySmooth(const float* In, int xStrideIn, int x, int y,
				  float* Out, int xStrideOut) {
  for (int xx= x/4; xx; xx--) {
    const float* PteIn= In+4*(xx-1);
    Msse121ySmooth_1(PteIn, xStrideIn);
    PteIn+= (xStrideIn+xStrideIn);
    float* PteOut= Out+4*(xx-1)+xStrideOut;
    for (int yy= y-2; yy; yy--) {
      Msse121ySmooth_2(PteIn, PteOut);
      PteIn+=  xStrideIn;
      PteOut+= xStrideOut;
    };
  };
}


static inline void Msse121xSmooth_1(const float* In) {
  static sse_t sse_25= { 0.25, 0.25, 0.25, 0.25 };
  movaps_m2r (sse_mask000f,xmm0);
  movaps_m2r (sse_maskfff0,xmm1);
  movaps_m2r (sse_25,xmm6);
  xorps_r2r  (xmm2,xmm2); // xmm2= 0 0 0 0
  movaps_m2r (*In,xmm3);
}
static inline void Msse121xSmooth_2(const float* In, float* Out) {
  // xmm2= i3 i2 i1 i0, xmm3= j3 j2 j1 j0
  shufps_r2r (xmm2,xmm2,0x93);
  andps_r2r  (xmm0,xmm2);      // xmm0= 000f , xmm2= 0 0 0 i3
  addps_r2r  (xmm3,xmm2);
  addps_r2r  (xmm3,xmm2);      // xmm2= 2*j3 2*j2 2*j1 i3+2*j0

  movaps_r2r (xmm3,xmm4);
  shufps_r2r (xmm4,xmm4,0x93); // xmm4= j2 j1 j0 j3
  andps_r2r  (xmm1,xmm4);      // xmm1= fff0 , xmm4= j2 j1 j0 0
  addps_r2r  (xmm4,xmm2);      // xmm2= j2+2*j3 j1+2*j2 j0+2*j1 i3+2*j0

  movaps_r2r (xmm3,xmm4);
  andps_r2r  (xmm1,xmm4);      // xmm4= j3 j2 j1 0
  shufps_r2r (xmm4,xmm4,0x39); // xmm4= 0 j3 j2 j1
  addps_r2r  (xmm4,xmm2);      // xmm2=j2+2*j3 j1+2*j2+j3 j0+2*j1+j2 i3+2*j0+j1

  movaps_m2r (*In,xmm4);       // xmm4= k3 k2 k1 k0
  movaps_r2r (xmm4,xmm5);
  andps_r2r  (xmm0,xmm5);      // xmm5= 0 0 0 k0
  shufps_r2r (xmm5,xmm5,0x39); // xmm5= k0 0 0 0
  addps_r2r  (xmm5,xmm2);   // xmm2=j2+2*j3+k0 j1+2*j2+j3 j0+2*j1+j2 i3+2*j0+j1

  mulps_r2r  (xmm6,xmm2);
  movntps_r2m (xmm2,*Out);
  movaps_r2r (xmm3,xmm2);
  movaps_r2r (xmm4,xmm3);
}
static inline void Msse121xSmooth(const float* In, int xStrideIn, int x, int y,
				  float* Out, int xStrideOut) {
  *(int*)&sse_mask000f.sf[0]= -1; *(int*)&sse_mask000f.sf[1]= 0;
  *(int*)&sse_mask000f.sf[2]= 0;  *(int*)&sse_mask000f.sf[3]= 0;
  *(int*)&sse_maskfff0.sf[0]= 0;  *(int*)&sse_maskfff0.sf[1]= -1;
  *(int*)&sse_maskfff0.sf[2]= -1; *(int*)&sse_maskfff0.sf[3]= -1;
  for (int yy= y; yy; yy--) {
    const float* PteIn= In+(yy-1)*xStrideIn;
    Msse121xSmooth_1(PteIn);
    PteIn+= 4;
    float* PteOut= Out+(yy-1)*xStrideOut;
    for (int xx= x/4; xx; xx--) {
      Msse121xSmooth_2(PteIn, PteOut);
      PteIn+= 4;
      PteOut+= 4;
    };
  };
}
#else
static inline void Msse121xSmooth(const float* In, int xStrideIn, int x, int y,
				  float* Out, int xStrideOut) {}
static inline void Msse121ySmooth(const float* In, int xStrideIn, int x, int y,
				  float* Out, int xStrideOut) {}
#endif

void MFloatImage::Smooth121(int IsHorizontal0Vertical1,
			    const MFloatImage& Data) {
  SetSize(Data.x__, Data.y__);
  if (IsHorizontal0Vertical1) {
    xMin_= Data.xMin_;   xMax_= Data.xMax_;
    yMin_= Data.yMin_+1; yMax_= Data.yMax_-1;
    if (MWithMmxSseUse())
      Msse121ySmooth(Data.Map, x__, x__, y__, Map, x__);
    else
      for (int xx= 0; xx< x__; xx++) {
	float *PteIn= Data.Map+xx, *PteOut= Map+xx+x__,
	  a= PteIn[0], b= PteIn[x__];
	PteIn+= (x__+x__);
	for (int yy= y__-2; yy; yy--) {
	  float c= *PteIn;
	  *PteOut= .25*(a+c+b+b);
	  PteIn+=  x__;
	  PteOut+= x__;
	  a= b; b= c;
	};
      };
  } else {
    xMin_= Data.xMin_+1; xMax_= Data.xMax_-1;
    yMin_= Data.yMin_;   yMax_= Data.yMax_;
    if (MWithMmxSseUse())
      Msse121xSmooth(Data.Map, x__, x__, y__, Map, x__);
    else
      for (int yy= 0; yy< y__; yy++) {
	float *PteIn= Data.Map+yy*x__+1, *PteOut= Map+yy*x__,
	  a= 0, b= PteIn[-1];
	for (int xx= 0; xx< x__; xx++) {
	  float c= PteIn[xx];
	  PteOut[xx]= .25*(a+c+b+b);
	  a= b; b= c;
	};
      };
  };
}

// ****************************** -1 0 1 gradient *****************************

#ifdef ARCH_X86
static inline void Msse121yGradient_1(const float* In, int xStride) {
  movaps_m2r (*In,xmm1);
  movaps_m2r (*(In+xStride),xmm2);
}
static inline void Msse121yGradient_2(const float* In, float* Out) {
  movaps_m2r (*In,xmm3);
  movaps_r2r (xmm3,xmm4);
  subps_r2r  (xmm1,xmm4);

  movntps_r2m (xmm4,*Out);
  movaps_r2r (xmm2,xmm1);
  movaps_r2r (xmm3,xmm2);
}
static inline void Msse121yGradient(const float* In, int xStrideIn,
				    int x, int y, float* Out, int xStrideOut) {
  for (int xx= x/4; xx; xx--) {
    const float* PteIn= In+4*(xx-1);
    Msse121yGradient_1(PteIn, xStrideIn);
    PteIn+= 2*xStrideIn;
    float* PteOut= Out+4*(xx-1)+xStrideOut;
    for (int yy= y-2; yy; yy--) {
      Msse121yGradient_2(PteIn, PteOut);
      PteIn+=  xStrideIn;
      PteOut+= xStrideOut;
    };
  };
}

static inline void Msse121xGradient_1(const float* In) {
  movaps_m2r (sse_mask000f,xmm0);
  movaps_m2r (sse_maskfff0,xmm1);
  xorps_r2r  (xmm2,xmm2); // xmm2= 0 0 0 0
  movaps_m2r (*In,xmm3);
}
static inline void Msse121xGradient_2(const float* In, float* Out) {
  // xmm2= i3 i2 i1 i0, xmm3= j3 j2 j1 j0
  shufps_r2r (xmm2,xmm2,0x93); // xmm2= i2 i1 i0 i3
  xorps_r2r  (xmm4,xmm4);
  subss_r2r  (xmm2,xmm4);
  movaps_r2r (xmm4,xmm2);      // xmm2= 0 0 0 -i3

  movaps_r2r (xmm3,xmm4);
  shufps_r2r (xmm4,xmm4,0x93); // xmm4= j2 j1 j0 j3
  andps_r2r  (xmm1,xmm4);      // xmm1= fff0 , xmm4= j2 j1 j0 0
  subps_r2r  (xmm4,xmm2);      // xmm2= -j2 -j1 -j0 -i3

  movaps_r2r (xmm3,xmm4);
  andps_r2r  (xmm1,xmm4);      // xmm4= j3 j2 j1 0
  shufps_r2r (xmm4,xmm4,0x39); // xmm4= 0 j3 j2 j1
  addps_r2r  (xmm4,xmm2);      // xmm2= -j2 j3-j2 j2-j0 j1-i3

  movaps_m2r (*In,xmm4);       // xmm4= k3 k2 k1 k0
  movaps_r2r (xmm4,xmm5);
  andps_r2r  (xmm0,xmm5);      // xmm5= 0 0 0 k0
  shufps_r2r (xmm5,xmm5,0x39); // xmm5= k0 0 0 0
  addps_r2r  (xmm5,xmm2);      // xmm2= k0-j2 j3-j2 j2-j0 j1-i3

  movntps_r2m (xmm2,*Out);
  movaps_r2r (xmm3,xmm2);
  movaps_r2r (xmm4,xmm3);
}

static inline void Msse121xGradient(const float* In, int xStrideIn,
				    int x, int y, float* Out, int xStrideOut) {
  *(int*)&sse_mask000f.sf[0]= -1; *(int*)&sse_mask000f.sf[1]= 0;
  *(int*)&sse_mask000f.sf[2]= 0;  *(int*)&sse_mask000f.sf[3]= 0;
  *(int*)&sse_maskfff0.sf[0]= 0;  *(int*)&sse_maskfff0.sf[1]= -1;
  *(int*)&sse_maskfff0.sf[2]= -1; *(int*)&sse_maskfff0.sf[3]= -1;
  for (int yy= y; yy; yy--) {
    const float* PteIn= In+(yy-1)*xStrideIn;
    Msse121xGradient_1(PteIn);
    PteIn+= 4;
    float* PteOut= Out+(yy-1)*xStrideOut;
    for (int xx= x/4; xx; xx--) {
      Msse121xGradient_2(PteIn, PteOut);
      PteIn+= 4;
      PteOut+= 4;
    };
  };
}
#else
static inline void Msse121xGradient(const float* In, int xStrideIn,
				   int x, int y, float* Out, int xStrideOut) {}
static inline void Msse121yGradient(const float* In, int xStrideIn,
				   int x, int y, float* Out, int xStrideOut) {}
#endif

void MFloatImage::Gradient_101(int IsHorizontal0Vertical1,
			       const MFloatImage& Data) {
  SetSize(Data.x__, Data.y__);
  if (IsHorizontal0Vertical1) {
    xMin_= Data.xMin_;   xMax_= Data.xMax_;
    yMin_= Data.yMin_+1; yMax_= Data.yMax_-1;
    if (MWithMmxSseUse())
      Msse121yGradient(Data.Map, x__, x__, y__, Map, x__);
    else
      for (int xx= 0; xx< x__; xx++) {
	float *PteIn= Data.Map+xx, *PteOut= Map+xx+x__,
	  a= PteIn[0], b= PteIn[x__];
	PteIn+= (x__+x__);
	for (int yy= y__-2; yy; yy--) {
	  float c= *PteIn;
	  *PteOut= c-a;
	  PteIn+=  x__;
	  PteOut+= x__;
	  a= b; b= c;
	};
      };
  } else {
    xMin_= Data.xMin_+1; xMax_= Data.xMax_-1;
    yMin_= Data.yMin_;   yMax_= Data.yMax_;
    if (MWithMmxSseUse())
      Msse121xGradient(Data.Map, x__, x__, y__, Map, x__);
    else
      for (int yy= 0; yy< y__; yy++) {
	float *PteIn= Data.Map+yy*x__+1, *PteOut= Map+yy*x__,
	  a= 0, b= PteIn[-1];
	for (int xx= 0; xx< x__; xx++) {
	  float c= PteIn[xx];
	  PteOut[xx]= c-a;
	  a= b; b= c;
	};
      };
  };
}

// ********************************** abs *************************************

#ifdef ARCH_X86
static inline void MsseAbs(float* InOut) {
  movaps_m2r (*InOut,xmm0);
  xorps_r2r  (xmm1,xmm1);
  subps_r2r  (xmm0,xmm1);
  maxps_r2r  (xmm1,xmm0);
  movntps_r2m (xmm0,*InOut);
}
static inline void MsseAbs(float* InOut, int Length) {
  for (int i= Length/4; i; i--) {
    MsseAbs(InOut);
    InOut+= 4;
  };
}
#else
static inline void MsseAbs(float* InOut, int Length) {}
#endif

void MFloatImage::AbsMap() {
  if (MWithMmxSseUse())
    MsseAbs(Map, x__*y__);
  else {
    float *Pte= Map-1;
    for (int i= x__*y__; i; i--)
      Pte[i]= Abs(Pte[i]);
  };
}

// ********************************* norm *************************************

#ifdef ARCH_X86
static inline void MsseNorm(const float* xIn, const float* yIn, float* Out) {
  movaps_m2r (*xIn,xmm0);
  mulps_r2r  (xmm0,xmm0);
  movaps_m2r (*yIn,xmm1);
  mulps_r2r  (xmm1,xmm1);
  addps_r2r  (xmm1,xmm0);
  sqrtps_r2r (xmm0,xmm0);
  movntps_r2m (xmm0,*Out);
}
static inline void MsseNorm(const float* xIn, const float* yIn,
			    int Length, float* Out) {
  for (int i= Length/4; i; i--) {
    MsseNorm(xIn, yIn, Out);
    xIn+= 4; yIn+= 4; Out+= 4;
  };
}
#else
static inline void MsseNorm(const float* xIn, const float* yIn,
			    int Length, float* Out) {}
#endif

void MFloatImage::Norm(const MFloatImage& xData, const MFloatImage& yData) {
  assert(xData.x__== yData.x__ && xData.y__== yData.y__);
  SetSize(xData.x__, xData.y__);
  xMin_= Max(xData.xMin_, yData.xMin_);
  yMin_= Max(xData.yMin_, yData.yMin_);
  xMax_= Min(xData.xMax_, yData.xMax_);
  yMax_= Min(xData.yMax_, yData.yMax_);
  if (MWithMmxSseUse())
    MsseNorm(xData.Map, yData.Map, x__*y__, Map);
  else {
    const float *xPte= xData.Map-1, *yPte= yData.Map-1;
    float *Pte= Map-1;
    for (int i= x__*y__; i; i--)
      Pte[i]= sqrt(xPte[i]*xPte[i]+yPte[i]*yPte[i]);
  };
}

// ****************************** 1 -4 1 laplacian ****************************

#ifdef ARCH_X86
//static sse_t sse_mask000f, sse_maskfff0, sse_mask0ff0;

static inline void Msse121xLaplacian_1(const float* In) {
  movaps_m2r (sse_mask000f,xmm0);
  movaps_m2r (sse_maskfff0,xmm1);
  xorps_r2r  (xmm2,xmm2); // xmm2= 0 0 0 0
  movaps_m2r (*In,xmm3);
}
static inline void Msse121xLaplacian_2(const float* In, float* Out) {
  // xmm2= i3 i2 i1 i0, xmm3= j3 j2 j1 j0
  shufps_r2r (xmm2,xmm2,0x93);
  andps_r2r  (xmm0,xmm2);      // xmm0= 000f , xmm2= 0 0 0 i3
  subps_r2r  (xmm3,xmm2);
  subps_r2r  (xmm3,xmm2);
  subps_r2r  (xmm3,xmm2);
  subps_r2r  (xmm3,xmm2);      // xmm2= -4*j3 -4*j2 -4*j1 i3-4*j0

  movaps_r2r (xmm3,xmm4);
  shufps_r2r (xmm4,xmm4,0x93); // xmm4= j2 j1 j0 j3
  andps_r2r  (xmm1,xmm4);      // xmm1= fff0 , xmm4= j2 j1 j0 0
  addps_r2r  (xmm4,xmm2);      // xmm2= j2-4*j3 j1-4*j2 j0-4*j1 i3-4*j0

  movaps_r2r (xmm3,xmm4);
  andps_r2r  (xmm1,xmm4);      // xmm4= j3 j2 j1 0
  shufps_r2r (xmm4,xmm4,0x39); // xmm4= 0 j3 j2 j1
  addps_r2r  (xmm4,xmm2);      // xmm2=j2-4*j3 j1-4*j2+j3 j0-4*j1+j2 i3-4*j0+j1

  movaps_m2r (*In,xmm4);       // xmm4= k3 k2 k1 k0
  movaps_r2r (xmm4,xmm5);
  andps_r2r  (xmm0,xmm5);      // xmm5= 0 0 0 k0
  shufps_r2r (xmm5,xmm5,0x39); // xmm5= k0 0 0 0
  addps_r2r  (xmm5,xmm2);   // xmm2=j2-4*j3+k0 j1-4*j2+j3 j0-4*j1+j2 i3-4*j0+j1

  movntps_r2m (xmm2,*Out);
  movaps_r2r (xmm3,xmm2);
  movaps_r2r (xmm4,xmm3);
}
static inline void Msse121xLaplacian(const float* In, int xStrideIn,
				     int x,int y, float* Out, int xStrideOut) {
  *(int*)&sse_mask000f.sf[0]= -1; *(int*)&sse_mask000f.sf[1]= 0;
  *(int*)&sse_mask000f.sf[2]= 0;  *(int*)&sse_mask000f.sf[3]= 0;
  *(int*)&sse_maskfff0.sf[0]= 0;  *(int*)&sse_maskfff0.sf[1]= -1;
  *(int*)&sse_maskfff0.sf[2]= -1; *(int*)&sse_maskfff0.sf[3]= -1;
  for (int yy= y; yy; yy--) {
    const float* PteIn= In+(yy-1)*xStrideIn;
    Msse121xLaplacian_1(PteIn);
    PteIn+= 4;
    float* PteOut= Out+(yy-1)*xStrideOut;
    for (int xx= x/4; xx; xx--) {
      Msse121xLaplacian_2(PteIn, PteOut);
      PteIn+= 4;
      PteOut+= 4;
    };
  };
}

static inline void Msse121yLaplacian_1(const float* In, int xStride) {
  movaps_m2r (*In,xmm1);
  movaps_m2r (*(In+xStride),xmm2);
}
static inline void Msse121yLaplacian_2a(const float* In, float* Out) {
  // xmm1= x4 x3 x2 x1, xmm2= y4 y3 y2 y1
  movaps_m2r (*In,xmm3); // xmm3= z4 z3 z2 z1
  addps_m2r  (*Out,xmm1);
  // xmm1= x4+(y5-4*y4+y3) x3+(y4-4*y3+y2) x2+(y3-4*y2+y1) x1+(y2-4*y1+y0)
  addps_r2r  (xmm3,xmm1);
 // x4+(y5-4*y4+y3)+z4 x3+(y4-4*y3+y2)+z3 x2+(y3-4*y2+y1)+z2 x1+(y2-4*y1+y0)+z1
  xorps_r2r  (xmm4,xmm4);
  subps_r2r  (xmm1,xmm4);
  maxps_r2r  (xmm4,xmm1);
  // xmm1= |xmm1(3)| |xmm1(2)| |xmm1(1)| |xmm1(0)|
  movntps_r2m (xmm1,*Out);

  movaps_r2r (xmm2,xmm1);
  movaps_r2r (xmm3,xmm2);
}
static inline void Msse121yLaplacian_2b(const float* In, float* Out) {
  // xmm1= x4 x3 x2 x1, xmm2= y4 y3 y2 y1
  movaps_m2r (*In,xmm3); // xmm3= z4 z3 z2 z1
  addps_m2r  (*Out,xmm1);
  // xmm1= x4+(y5-4*y4+y3) x3+(y4-4*y3+y2) x2+(y3-4*y2+y1) x1+(y2-4*y1+y0)
  addps_r2r  (xmm3,xmm1);
 // x4+(y5-4*y4+y3)+z4 x3+(y4-4*y3+y2)+z3 x2+(y3-4*y2+y1)+z2 x1+(y2-4*y1+y0)+z1
  movntps_r2m (xmm1,*Out);

  movaps_r2r (xmm2,xmm1);
  movaps_r2r (xmm3,xmm2);
}
static inline void Msse121yLaplacian(const float* In, int xStrideIn,
			int x,int y, float* Out, int xStrideOut, int WithAbs) {
  if (WithAbs) {
    for (int xx= x/4; xx; xx--) {
      const float *PteIn= In+4*(xx-1);
      Msse121yLaplacian_1(PteIn, xStrideIn);
      PteIn+= 2*xStrideIn;
      float* PteOut= Out+4*(xx-1)+xStrideOut;
      for (int yy= y-2; yy; yy--) {
	Msse121yLaplacian_2a(PteIn, PteOut);
	PteIn+=  xStrideIn;
	PteOut+= xStrideOut;
      };
    };
  } else {
    for (int xx= x/4; xx; xx--) {
      const float *PteIn= In+4*(xx-1);
      Msse121yLaplacian_1(PteIn, xStrideIn);
      PteIn+= 2*xStrideIn;
      float* PteOut= Out+4*(xx-1)+xStrideOut;
      for (int yy= y-2; yy; yy--) {
	Msse121yLaplacian_2b(PteIn, PteOut);
	PteIn+=  xStrideIn;
	PteOut+= xStrideOut;
      };
    };
  };
}
#else
static inline void Msse121xLaplacian(const float* In, int xStrideIn,
				   int x, int y, float* Out, int xStrideOut) {}
static inline void Msse121yLaplacian(const float* In, int xStrideIn,
		       int x,int y, float* Out, int xStrideOut, int WithAbs) {}
#endif

void MFloatImage::Laplacian1_41(const MFloatImage& Data, int WithAbs) {
  assert(this!= &Data);
  SetSize(Data.x__, Data.y__);
  xMin_++; xMax_--;
  yMin_++; yMax_--;
  if (MWithMmxSseUse()) {
    Msse121xLaplacian(Data.Map, x__, x__, y__, Map, x__);
    Msse121yLaplacian(Data.Map, x__, x__, y__, Map, x__, WithAbs);
  } else if (WithAbs)
    for (int i= x__; i< x__*(y__-1); i++)
      Map[i]= Abs(Data.Map[i-1]+Data.Map[i+1]+Data.Map[i-x__]+Data.Map[i+x__]
		  -4*Data.Map[i]);
  else
    for (int i= x__; i< x__*(y__-1); i++)
      Map[i]= Data.Map[i-1]+Data.Map[i+1]+Data.Map[i-x__]+Data.Map[i+x__]
	-4*Data.Map[i];
}

// ********************************* moravec **********************************

#ifdef ARCH_X86
static inline void MsseMoravec5x5_1(const unsigned char* In) {
  static mmx_t mmx_00fffff0 = {0x0000ffffffffff00ull};
  movq_m2r    (mmx_00fffff0,mm0); // mm0= 00 00 ff ff ff ff ff 00
  movq_m2r    (*In,mm1);          // mm1= y7 y6 y5 y4 y3 y2 y1 y0
  xorps_r2r   (xmm1,xmm1);
  xorps_r2r   (xmm2,xmm2);
  xorps_r2r   (xmm3,xmm3);
  xorps_r2r   (xmm4,xmm4);
  xorps_r2r   (xmm5,xmm5);
  xorps_r2r   (xmm6,xmm6);
  xorps_r2r   (xmm7,xmm7);
}
static inline void MsseMoravec5x5_2(const unsigned char* In) {
  movaps_r2r  (xmm5,xmm6);
  movaps_r2r  (xmm4,xmm5);
  movaps_r2r  (xmm3,xmm4);
  movaps_r2r  (xmm2,xmm3);
  movaps_r2r  (xmm1,xmm2);

  // mm0= 00 00 ff ff ff ff ff 00
  // mm1= y7 y6 y5 y4 y3 y2 y1 y0
  movq_r2r    (mm1,mm2);
  pand_r2r    (mm0,mm2);  // mm2= 00 00 y5 y4 y3 y2 y1 00

  movq_r2r    (mm1,mm3);
  psrlq_i2r   (8,mm3);
  pand_r2r    (mm0,mm3);  // mm3= 00 00 y6 y5 y4 y3 y2 00
  psadbw_r2r  (mm2,mm3);  // mm3= 0 0 0 |y2-y1|+|y3-y2|+|y4-y3|+|y5-y4|+|y6-y5|

  movq_m2r    (*In,mm1);  // mm1= z7 z6 z5 z4 z3 z2 z1 z0

  movq_r2r    (mm1,mm4);
  pand_r2r    (mm0,mm4);  // mm4= 00 00 z5 z4 z3 z2 z1 00
  psadbw_r2r  (mm2,mm4);  // mm4= 0 0 0 |z5-y5|+|z4-y4|+|z3-y3|+|z2-y2|+|z1-y1|
  punpckldq_r2r (mm4,mm3);  // mm3= sum|z(i)-y(i)| sum|y(i+1)-y(i)|
  cvtpi2ps_r2r (mm3,xmm0);

  movq_r2r    (mm1,mm3);
  psrlq_i2r   (8,mm3);
  pand_r2r    (mm0,mm3);  // mm3= 00 00 z6 z5 z4 z3 z2 00
  psadbw_r2r  (mm2,mm3);  // mm3= 0 0 0 |z6-y5|+|z5-y4|+|z4-y3|+|z3-y2|+|z2-y1|

  movq_r2r    (mm1,mm4);
  psllq_i2r   (8,mm4);
  pand_r2r    (mm0,mm4);  // mm4= 00 00 z4 z3 z2 z1 z0 00
  psadbw_r2r  (mm2,mm4);  // mm4= 0 0 0 |z4-y5|+|z3-y4|+|z2-y3|+|z1-y2|+|z0-y1|
  punpckldq_r2r (mm4,mm3);  // mm3= sum|z(i-1)-y(i)| sum|z(i+1)-y(i)|
  cvtpi2ps_r2r (mm3,xmm1);
  shufps_r2r  (xmm0,xmm1,0x44);

  // xmm1= sum|z(i)-y(i)| sum|y(i+1)-y(i)| sum|z(i-1)-y(i)| sum|z(i+1)-y(i)|
  subps_r2r   (xmm6,xmm7);
  addps_r2r   (xmm1,xmm7);
}
static inline void MsseMoravec5x5_3(float* Out) {
  movss_r2r   (xmm7,xmm0);
  shufps_r2r  (xmm7,xmm7,0x39);
  minss_r2r   (xmm7,xmm0);
  shufps_r2r  (xmm7,xmm7,0x39);
  minss_r2r   (xmm7,xmm0);
  shufps_r2r  (xmm7,xmm7,0x39);
  minss_r2r   (xmm7,xmm0);
  shufps_r2r  (xmm7,xmm7,0x39);
  movss_r2m   (xmm0,*Out);
}

static void MsseMoravec5x5(const unsigned char* In, int xStrideIn,
			   int x, int y, float* Out, int xStrideOut) {
  for (int xx= 3; xx< x-3; xx++) {
    const unsigned char* PteIn= In+(xx-3)+xStrideIn;
    MsseMoravec5x5_1(PteIn); PteIn+= xStrideIn;
    MsseMoravec5x5_2(PteIn); PteIn+= xStrideIn;
    MsseMoravec5x5_2(PteIn); PteIn+= xStrideIn;
    MsseMoravec5x5_2(PteIn); PteIn+= xStrideIn;
    MsseMoravec5x5_2(PteIn); PteIn+= xStrideIn;
    float* PteOut= Out+(xx-3)+3+3*xStrideOut;
    for (int yy= y-6; yy; yy--) {
      MsseMoravec5x5_2(PteIn);  PteIn+=  xStrideIn;
      MsseMoravec5x5_3(PteOut); PteOut+= xStrideOut;
    };
  };
}
#else
static void MsseMoravec5x5(const unsigned char* In, int xStrideIn,
			   int x, int y, float* Out, int xStrideOut) {}
#endif

void MFloatImage::Moravec5x5(const MCharImage& Data) {
  SetSize(Data.x(), Data.y());
  xMin_= 3; xMax_= x__-3;
  yMin_= 3; yMax_= y__-3;
  if (MWithMmxSseUse()) {
    MsseMoravec5x5(&Data.y_(0,0), x__, x__,y__, Map, x__);
    Memms();
    /*
    for (int yy= y__-2; yy; yy--) {
      int* Pte= (int*)Map+x__*yy;
      for (int xx= x__-2; xx; xx--)
	Pte[xx]= Data.Moravec5x5(xx,yy);
    };
    MsseMoravec5x5(Map, x__*y__);
    Memms();
    */
  } else {
    float Lookup[256*25];
    for (int i= 0; i< 256*25; i++)
      Lookup[i]= float(i);
    for (int yy= yMin_; yy< yMax_; yy++) {
      float* Pte= Map+yy*x__;
      for (int xx= xMin_; xx< xMax_; xx++)
	Pte[xx]= Lookup[Data.Moravec5x5(xx,yy)];
    };
  };
}


// *************************** 5x5 standart deviation *************************

#ifdef ARCH_X86
static inline void MsseInvStdDev5x5_1() {
  static mmx_t mmx_8888= {0x0080008000800080ull};
  movq_m2r      (mmx_8888,mm0);
  pxor_r2r      (mm1,mm1);
  xorps_r2r     (xmm1,xmm1);
  xorps_r2r     (xmm2,xmm2);
  xorps_r2r     (xmm3,xmm3);
  xorps_r2r     (xmm4,xmm4);
  xorps_r2r     (xmm5,xmm5);
  xorps_r2r     (xmm7,xmm7);
}
static inline void MsseInvStdDev5x5_2(const unsigned char* In) {
  movq_m2r      (*In,mm2);
  movq_r2r      (mm2,mm3);

  // ** using i3 i2 i1 i0 **
  punpcklbw_r2r (mm1,mm2);
  psubw_r2r     (mm0,mm2);      // mm6= i3 i2 i1 i0
  movq_r2r      (mm2,mm4);

  pmaddwd_r2r   (mm4,mm4);      // mm4= i3*i3+i2*i2  i1*i1+i0*i0
  pshufw_r2r    (mm4,mm5,0x4e); // mm5= i1*i1+i0*i0  i3*i3+i2*i2
  punpckldq_r2r (mm1,mm5);      // mm5= 0            i3*i3+i2*i2
  paddd_r2r     (mm4,mm5);      // mm5= i3*i3+i2*i2  i3*i3+i2*i2+i1*i1+i0*i0
  cvtpi2ps_r2r  (mm5,xmm0);
  shufps_r2r    (xmm0,xmm0,0x4e);
  // xmm0= i3*i3+i2*i2  i3*i3+i2*i2+i1*i1+i0*i0  *  *

  psrlq_i2r     (16,mm2);       // mm2= 0 i3 i2 i1
  pmaddwd_r2r   (mm2,mm2);      // mm2= i3*i3  i2*i2+i1*i1
  pshufw_r2r    (mm2,mm4,0x4e); // mm4= i2*i2+i1*i1 i3*i3
  punpckldq_r2r (mm1,mm4);      // mm4= 0           i3*i3
  paddd_r2r     (mm2,mm4);      // mm4= i3*i3  i3*i3+i2*i2+i1*i1
  cvtpi2ps_r2r  (mm4,xmm0);
  // xmm0= i3*i3+i2*i2  i3*i3+i2*i2+i1*i1+i0*i0  i3*i3  i3*i3+i2*i2+i1*i1

  // ** using i7 i6 i5 i4 **
  punpckhbw_r2r (mm1,mm3);
  psubw_r2r     (mm0,mm3);      // mm3= i7 i6 i5 i4
  movq_r2r      (mm3,mm2);

  pmaddwd_r2r   (mm3,mm3);      // mm3= i7*i7+i6*i6  i5*i5+i4*i4
  movq_r2r      (mm3,mm4);
  punpckldq_r2r (mm1,mm4);      // mm4= 0            i5*i5+i4*i4
  pshufw_r2r    (mm4,mm4,0x4e); // mm4= i5*i5+i4*i4  0
  paddd_r2r     (mm3,mm4);      // mm4= i7*i7+i6*i6+i5*i5+i4*i4  i5*i5+i4*i4
  cvtpi2ps_r2r  (mm4,xmm6);
  shufps_r2r    (xmm6,xmm6,0x4e);
  // xmm6= i7*i7+i6*i6+i5*i5+i4*i4  i5*i5+i4*i4  *  *

  psllq_i2r     (16,mm2);       // mm2= i6 i5 i4 0
  pmaddwd_r2r   (mm2,mm2);      // mm2= i5*i5+i6*i6  i4*i4
  movq_r2r      (mm2,mm3);
  punpckldq_r2r (mm1,mm3);      // mm3= 0            i4*i4
  pshufw_r2r    (mm3,mm3,0x4e); // mm3= i4*i4        0
  paddd_r2r     (mm2,mm3);      // mm3= i4*i4+i5*i5+i6*i6  i4*i4
  cvtpi2ps_r2r  (mm3,xmm6);
  shufps_r2r    (xmm6,xmm6,0x4e);
  // xmm6= i4*i4+i5*i5+i6*i6  i4*i4  i7*i7+i6*i6+i5*i5+i4*i4  i5*i5+i4*i4

  addps_r2r     (xmm6,xmm0);
  shufps_r2r    (xmm0,xmm0,0x72); // 72 = 01 11 00 10
  // xmm0= i7*i7+i6*i6+i5*i5+i4*i4+i3*i3  i6*i6+i5*i5+i4*i4+i3*i3+i2*i2
  //       i5*i5+i4*i4+i3*i3+i2*i2+i1*i1  i4*i4+i3*i3+i2*i2+i1*i1+i0*i0

  // ** using i7 i6 i5 i4 i3 i2 i1 i0 **
  addps_r2r     (xmm0,xmm7);
  subps_r2r     (xmm5,xmm7);

  movaps_r2r    (xmm4,xmm5);
  movaps_r2r    (xmm3,xmm4);
  movaps_r2r    (xmm2,xmm3);
  movaps_r2r    (xmm1,xmm2);
  movaps_r2r    (xmm0,xmm1);
}
static inline void MsseInvStdDev5x5_3(float* Out) {
  movaps_r2r    (xmm7,xmm6);
  xorps_r2r     (xmm0,xmm0);
  cmpneqps_r2r  (xmm6,xmm0); // xmm0= .. ((iSumQuare)? -1:0) ..
  rsqrtps_r2r   (xmm6,xmm6);
  andps_r2r     (xmm0,xmm6); // xmm6= .. ((iSumSquare)? 1/sqrt(iSumQuare):0) ..
  movntps_r2m   (xmm6,*Out);
}
#else
static inline void MsseInvStdDev5x5_1() {}
static inline void MsseInvStdDev5x5_2(const unsigned char* In) {}
static inline void MsseInvStdDev5x5_3(float* Out) {}
#endif

void MFloatImage::InverseStandartDeviation5x5(const MCharImage& Data,
					      int BorderSize) {
  SetSize(Data.x(), Data.y());
  xMin_= BorderSize; xMax_= x__-4-BorderSize;
  yMin_= BorderSize; yMax_= y__-4-BorderSize;
  if (MWithMmxSseUse()) {
    for (int yy, xx= x__/4-1; xx; xx--) {
      MsseInvStdDev5x5_1();
      const unsigned char* In= &Data.y_(0,0)+(xx-1)*4;
      for (yy= 4; yy; yy--) {
	MsseInvStdDev5x5_2(In);  In+= x__;
      };
      float* Out= Map+(xx-1)*4;
      for (yy= y__-4; yy; yy--) {
	MsseInvStdDev5x5_2(In);  In+= x__;
	MsseInvStdDev5x5_3(Out); Out+= x__;
      };
    };
    Memms();
  } else
    for (int yy= 0; yy< y__-4; yy++)
      for (int xx= 0; xx< x__-4; xx++) {
	int S= 0;
	for (int dy= 5; dy; dy--) {
	  const unsigned char* In= &Data.y_(xx-1,yy+dy-1);
	  for (int dx= 5; dx; dx--) {
	    int V= In[dx]-128;
	    S+= V*V;
	  };
	};
	M(xx,yy)= ((S)? 1/sqrt(float(S)):0);
      };
}

// ********************************* 5x5 ZNCC *********************************

#ifdef ARCH_X86
static inline void MsseZncc5x5_1() {
  static mmx_t mmx_8888= {0x0080008000800080ull};
  movq_m2r      (mmx_8888,mm0);
  pxor_r2r      (mm1,mm1);
  xorps_r2r     (xmm1,xmm1);
  xorps_r2r     (xmm2,xmm2);
  xorps_r2r     (xmm3,xmm3);
  xorps_r2r     (xmm4,xmm4);
  xorps_r2r     (xmm5,xmm5);
  xorps_r2r     (xmm7,xmm7);
}
static inline void MsseZncc5x5_2
                         (const unsigned char* In0, const unsigned char* In1) {
  movq_m2r      (*In0,mm4);
  movq_m2r      (*In1,mm5);

  // ** using i3 i2 i1 i0 and j3 j2 j1 j0 **
  movq_r2r      (mm4,mm6);
  movq_r2r      (mm5,mm7);
  punpcklbw_r2r (mm1,mm6);
  punpcklbw_r2r (mm1,mm7);
  psubw_r2r     (mm0,mm6);      // mm6= i3 i2 i1 i0
  psubw_r2r     (mm0,mm7);      // mm7= j3 j2 j1 j0
  movq_r2r      (mm6,mm2);
  movq_r2r      (mm7,mm3);

  pmaddwd_r2r   (mm6,mm7);      // mm7= i3*j3+i2*j2  i1*j1+i0*j0
  pshufw_r2r    (mm7,mm6,0x4e); // mm6= i1*j1+i0*j0  i3*j3+i2*j2
  punpckldq_r2r (mm1,mm6);      // mm6= 0            i3*j3+i2*j2
  paddd_r2r     (mm6,mm7);      // mm7= i3*j3+i2*j2  i3*j3+i2*j2+i1*j1+i0*j0
  cvtpi2ps_r2r  (mm7,xmm0);
  shufps_r2r    (xmm0,xmm0,0x4e);
  // xmm0= i3*j3+i2*j2  i3*j3+i2*j2+i1*j1+i0*j0  *  *

  psrlq_i2r     (16,mm2);       // mm2= 0 i3 i2 i1
  psrlq_i2r     (16,mm3);       // mm3= 0 j3 j2 j1
  pmaddwd_r2r   (mm2,mm3);      // mm3= i3*j3  i2*j2+i1*j1
  pshufw_r2r    (mm3,mm2,0x4e); // mm2= i2*j2+i1*j1 i3*j3
  punpckldq_r2r (mm1,mm2);      // mm2= 0           i3*j3
  paddd_r2r     (mm2,mm3);      // mm3= i3*j3  i3*j3+i2*j2+i1*j1
  cvtpi2ps_r2r  (mm3,xmm0);
  // xmm0= i3*j3+i2*j2  i3*j3+i2*j2+i1*j1+i0*j0  i3*j3  i3*j3+i2*j2+i1*j1

  // ** using i7 i6 i5 i4 and j7 j6 j5 j4 **
  punpckhbw_r2r (mm1,mm4);
  punpckhbw_r2r (mm1,mm5);
  psubw_r2r     (mm0,mm4);      // mm4= i7 i6 i5 i4
  psubw_r2r     (mm0,mm5);      // mm5= j7 j6 j5 j4
  movq_r2r      (mm4,mm2);
  movq_r2r      (mm5,mm3);

  pmaddwd_r2r   (mm4,mm5);      // mm5= i7*j7+i6*j6  i5*j5+i4*j4
  movq_r2r      (mm5,mm4);
  punpckldq_r2r (mm1,mm4);      // mm4= 0            i5*j5+i4*j4
  pshufw_r2r    (mm4,mm4,0x4e); // mm4= i5*j5+i4*j4  0
  paddd_r2r     (mm4,mm5);      // mm5= i7*j7+i6*j6+i5*j5+i4*j4  i5*j5+i4*j4
  cvtpi2ps_r2r  (mm5,xmm6);
  shufps_r2r    (xmm6,xmm6,0x4e);
  // xmm6= i7*j7+i6*j6+i5*j5+i4*j4  i5*j5+i4*j4  *  *

  psllq_i2r     (16,mm2);       // mm2= i6 i5 i4 0
  psllq_i2r     (16,mm3);       // mm3= j6 j5 j4 0
  pmaddwd_r2r   (mm2,mm3);      // mm3= i5*j5+i6*j6  i4*j4
  movq_r2r      (mm3,mm2);
  punpckldq_r2r (mm1,mm2);      // mm4= 0            i4*j4
  pshufw_r2r    (mm2,mm2,0x4e); // mm2= i4*j4        0
  paddd_r2r     (mm2,mm3);      // mm3= i4*j4+i5*j5+i6*j6  i4*j4
  cvtpi2ps_r2r  (mm3,xmm6);
  shufps_r2r    (xmm6,xmm6,0x4e);
  // xmm6= i4*j4+i5*j5+i6*j6  i4*j4  i7*j7+i6*j6+i5*j5+i4*j4  i5*j5+i4*j4

  addps_r2r     (xmm6,xmm0);
  shufps_r2r    (xmm0,xmm0,0x72); // 72 = 01 11 00 10
  // xmm0= i7*j7+i6*j6+i5*j5+i4*j4+i3*j3  i6*j6+i5*j5+i4*j4+i3*j3+i2*j2
  //       i5*j5+i4*j4+i3*j3+i2*j2+i1*j1  i4*j4+i3*j3+i2*j2+i1*j1+i0*j0

  // ** using i7 i6 i5 i4 i3 i2 i1 i0 and j7 j6 j5 j4 j3 j2 j1 j0 **
  addps_r2r     (xmm0,xmm7);
  subps_r2r     (xmm5,xmm7);

  movaps_r2r    (xmm4,xmm5);
  movaps_r2r    (xmm3,xmm4);
  movaps_r2r    (xmm2,xmm3);
  movaps_r2r    (xmm1,xmm2);
  movaps_r2r    (xmm0,xmm1);
}
static inline void MsseZncc5x5_3(const float*InvStdev0, const float*InvStdev1,
				 float* Out) {
  movups_m2r    (*InvStdev1,xmm0);
  mulps_r2r     (xmm7,xmm0);
  mulps_m2r     (*InvStdev0,xmm0);
  //movaps_r2r    (xmm7,xmm0);
  //mulps_m2r     (*InvStdev0,xmm0);
  //mulps_m2r     (*InvStdev1,xmm0);
  movntps_r2m   (xmm0,*Out);
}
#else
static inline void MsseZncc5x5_1() {}
static inline void MsseZncc5x5_2
                        (const unsigned char* In0, const unsigned char* In1) {}
static inline void MsseZncc5x5_3(const float*InvStdev0, const float*InvStdev1,
				 float* Out) {}
#endif

void MFloatImage::Zncc5x5(const MCharImage& Data0, const MCharImage& Data1,
		  const MFloatImage& InvStdDev0, const MFloatImage& InvStdDev1,
			  int x0, int y0, int x1, int y1, int dx, int dy) {
  assert(Data0.x()== InvStdDev0.x() && Data0.y()== InvStdDev0.y() &&
	 Data1.x()== InvStdDev1.x() && Data1.y()== InvStdDev1.y());
  SetSize(((dx+7)&~7), ((dy+1)&~1));
  if (!(1< x0 && x0+dx+2< Data0.x() && 1< y0 && y0+dy+2< Data0.y() &&
	1< x1 && x1+dx+2< Data1.x() && 1< y1 && y1+dy+2< Data1.y())) {
    memset(Map, 0, x__*y__*4);
    return;
  }

  xMin_= 0; xMax_= ((dx+3)&~3);
  yMin_= 0; yMax_= dy;
  x0-= 2; y0-= 2;
  x1-= 2; y1-= 2;
  if (MWithMmxSseUse()) {
    for (int yy, xx= xMax_/4; xx; xx--) {
      MsseZncc5x5_1();
      const unsigned char *In0= &Data0.y_(x0+4*(xx-1), y0),
	                  *In1 =&Data1.y_(x1+4*(xx-1), y1);
      for (yy= 4; yy; yy--) {
	MsseZncc5x5_2(In0, In1);
	In0+= Data0.x(); In1+= Data1.x();
      };
      const float *Inv0= &InvStdDev0.M_(x0+4*(xx-1), y0),
	          *Inv1= &InvStdDev1.M_(x1+4*(xx-1), y1);
      float* Out= Map+(xx-1)*4;
      for (yy= y__; yy; yy--) {
	MsseZncc5x5_2(In0, In1);
	In0+= Data0.x(); In1+= Data1.x();
	MsseZncc5x5_3(Inv0, Inv1, Out);
	Inv0+= Data0.x(); Inv1+= Data1.x(); Out+= x__;
      };
    };
    Memms();
  } else
    for (int yy= 0; yy< dy; yy++)
      for (int xx= 0; xx< dx; xx++) {
	int S0= 0, S1= 0, P= 0;
	for (int dyy= 5; dyy; dyy--) {
	  const unsigned char *In0= &Data0.y_(x0+xx-1, y0+yy+dyy-1),
	                      *In1= &Data1.y_(x1+xx-1, y1+yy+dyy-1);
	  for (int dxx= 5; dxx; dxx--) {
	    int V0= In0[dxx]-128, V1= In1[dxx]-128;
	    S0+= V0*V0; S1+= V1*V1; P+= V0*V1;
	  };
	};
	M(xx,yy)= ((P)? float(P)/sqrt(float(S0)*float(S1)):0);
      };
}


// ****************************************************************************

void MCompare(const MFloatImage& Map0, const MFloatImage& Map1) {
  int xMin0, xMax0, yMin0, yMax0,  xMin1, xMax1, yMin1, yMax1, x, y;
  Map0.Bound(xMin0, xMax0, yMin0, yMax0);
  Map1.Bound(xMin1, xMax1, yMin1, yMax1);
  assert(xMin0== xMin1 && xMax0== xMax1 && yMin0== yMin1 && yMax0== yMax1);
  for (int y= yMin0; y< yMax0; y++) {
    for (int x= xMin0; x< xMax0; x++)
      if (Abs(Map0.M_(x,y)-Map1.M_(x,y))>1e-3)
	cout << x << " " << y << "= " << Map0.M_(x,y) << " " << Map1.M_(x,y)
	     << " , ";
    // cout << endl << endl;
  };
}


//---------------------------------------------------------------------------------------------------
// MFloatImage::Harris
//---------------------------------------------------------------------------------------------------
#ifdef ARCH_X86

static inline void MsseHarris1(const signed short* InA, const signed short* InB,const signed short* InC,
		 float* Out,float *k4)
{
  static mmx_t mmx_0= {0x00000000};
  static mmx_t mmx_16={0x00000010};
  movq_m2r(mmx_0,mm6);

  //Conversion de InA en float
  movq_m2r(*InA,mm0);         //mm0= i3 i2 i1 i0 (2 octets par pixel)
  movq_r2r(mm6,mm7);          //mm7= 00 00 00 00
  punpcklwd_r2r(mm0,mm7);     //mm7= i100 i000
  psrad_m2r(mmx_16,mm7);      //mm7= iii1 iii0 valeurs en signed double word

  movq_r2r(mm6,mm1);          //mm1= 00 00 00 00
  punpckhwd_r2r(mm0,mm1);     //mm1= i300 i200
  psrad_m2r(mmx_16,mm1);      //mm1= iii3 iii2 valeurs en signed double word

  cvtpi2ps_r2r(mm1,xmm4);     //xmm4= 0000 0000 iii3 iii2
  cvtpi2ps_r2r(mm7,xmm0);     //xmm0= 0000 0000 iii1 iii0
  movlhps_r2r(xmm4,xmm0);     //xmm0= iii3 iii2 iii1 iii0 = float(A)

  //conversion de InB en float
  movq_m2r(*InB,mm0);         //mm0= i3 i2 i1 i0 (2 octets par pixel)
  movq_r2r(mm6,mm7);          //mm7= 00 00 00 00
  punpcklwd_r2r(mm0,mm7);     //mm7= i100 i000
  psrad_m2r(mmx_16,mm7);      //mm7= iii1 iii0 valeurs en signed double word

  movq_r2r(mm6,mm1);          //mm1= 00 00 00 00
  punpckhwd_r2r(mm0,mm1);     //mm1= i300 i200
  psrad_m2r(mmx_16,mm1);      //mm1= iii3 iii2 valeurs en signed double word

  cvtpi2ps_r2r(mm1,xmm4);     //xmm4= 0000 0000 iii3 iii2
  cvtpi2ps_r2r(mm7,xmm1);     //xmm1= 0000 0000 iii1 iii0
  movlhps_r2r(xmm4,xmm1);     //xmm1= iii3 iii2 iii1 iii0 = float(B)


  //conversion de InC en float
  movq_m2r(*InC,mm0);         //mm0= i3 i2 i1 i0 (2 octets par pixel)
  movq_r2r(mm6,mm7);          //mm7= 00 00 00 00
  punpcklwd_r2r(mm0,mm7);     //mm7= i100 i000
  psrad_m2r(mmx_16,mm7);      //mm7= iii1 iii0 valeurs en signed double word

  movq_r2r(mm6,mm1);          //mm1= 00 00 00 00
  punpckhwd_r2r(mm0,mm1);     //mm1= i300 i200
  psrad_m2r(mmx_16,mm1);      //mm1= iii3 iii2 valeurs en signed double word

  cvtpi2ps_r2r(mm1,xmm4);     //xmm4= 0000 0000 iii3 iii2
  cvtpi2ps_r2r(mm7,xmm2);     //xmm2= 0000 0000 iii1 iii0
  movlhps_r2r(xmm4,xmm2);     //xmm2= iii3 iii2 iii1 iii0 = float(C)

  //calcul de A*B-C*C-k*(A+B)
  //on met k k k k dans xmm7
  movaps_m2r(*k4,xmm7);
  movaps_r2r(xmm0,xmm3);
  addps_r2r(xmm1,xmm3);  //xmm3 = A+B
  mulps_r2r(xmm3,xmm3);  //xmm3 = (A+B)^2
  mulps_r2r(xmm7,xmm3);  //xmm3 = k*(A+B)^2
  mulps_r2r(xmm2,xmm2);  //xmm2 = C*C
  mulps_r2r(xmm0,xmm1);  //xmm1 = A*B
  subps_r2r(xmm2,xmm1);  //xmm1 = A*B-C*C
  subps_r2r(xmm3,xmm1);  //xmm1 = A*B-C*C-k*(A+B)

  movaps_r2m(xmm1,*Out);
}
static inline void MsseHarris(const signed short* InA, const signed short* InB, const signed short* InC,
		float *Out,int nbPix,float k)
{
  float k4[4];
  k4[0]=k;k4[1]=k;k4[2]=k;k4[3]=k;
  const signed short* PtInA=InA;
  const signed short* PtInB=InB;
  const signed short* PtInC=InC;
  float* PtOut=Out;

  for (int S=0;S<nbPix;S+=4)
    {
      MsseHarris1(PtInA,PtInB,PtInC,PtOut,k4);
      PtInA+=4;
      PtInB+=4;
      PtInC+=4;
      PtOut+=4;
    }
}
#else
static inline void MsseHarris(const signed short* inA, const signed short* inB, const signed short* inC,
		float *Out,int nbPix,float k) {}
#endif
void MFloatImage::Harris(const MSignedShortImage& DataA,const MSignedShortImage& DataB,const MSignedShortImage & DataC)
{
  SetSize(DataA.x(), DataA.y());
  xMin_= 3; xMax_= x__-3;
  yMin_= 3; yMax_= y__-3;
  float a,b,c,trace;
  float k=0.04;

  const signed short* InA= &DataA.M_(0,0);
  const signed short* InB= &DataB.M_(0,0);
  const signed short* InC= &DataC.M_(0,0);
  if (MWithMmxSseUse()==2) {
    MsseHarris(InA,InB,InC,Map, x__*y__,k);
    Memms(); // now FPU can be used
  }
  else {
    for (int S=x__*y__; S; S--)
      {
	a=float(InA[S]);
	b=float(InB[S]);
	c=float(InC[S]);
	trace=a+b;
	Map[S]=(a*b-c*c-k*trace*trace);
      }
  }
}

//---------------------------------------------------------------------------------------------------------------------
// VPInf
//---------------------------------------------------------------------------------------------------------------------
void MFloatImage::VPInf(const MSignedShortImage& DataA,const MSignedShortImage& DataB, const MSignedShortImage & DataC)
{
  SetSize(DataA.x(), DataA.y());
  xMin_= 3; xMax_= x__-3;
  yMin_= 3; yMax_= y__-3;
  float a,b,c,trace,delta;
  float k=0.04;

  const signed short* InA= &DataA.M_(0,0);
  const signed short* InB= &DataB.M_(0,0);
  const signed short* InC= &DataC.M_(0,0);
  /*if (MWithMmxSseUse()) {
    MsseVPInf(InA,InB,InC,Map, x__*y__,k);
    Memms(); // now FPU can be used
  }
  else*/
  {
		for (int S=x__*y__; S; S--)
		{
			a=float(InA[S]);
			b=float(InB[S]);
			c=float(InC[S]);
			trace=a+b;
			delta=trace*trace-4*(a*b-c*c);
			Map[S]=(trace-sqrt(delta))/2;
    }
  }
}

// *************************** local maxima list ******************************
#ifdef ARCH_X86
static inline void MsseLocalMaximaList_1(float* MinValue, const float* In,
					 int xStride) {
  movss_m2r  (*MinValue,xmm0);
  shufps_r2r (xmm0,xmm0,0);

  movaps_m2r (*In,xmm1);  // xmm1= i3 i2 i1 i0
  movaps_r2r (xmm1,xmm6);
  movaps_r2r (xmm1,xmm7);
  shufps_r2r (xmm6,xmm6,0x39);
  maxps_r2r  (xmm6,xmm1);
  shufps_r2r (xmm7,xmm7,0x93);
  maxps_r2r  (xmm7,xmm1); // xmm1= * max(i3,i2,i1) max(i2,i1,i0) *

  movaps_m2r (*(In+xStride),xmm2); // xmm2= j3 j2 j1 j0
  movaps_r2r (xmm2,xmm3);
  movaps_r2r (xmm2,xmm6);
  movaps_r2r (xmm2,xmm7);
  shufps_r2r (xmm6,xmm6,0x39);
  maxps_r2r  (xmm6,xmm3);
  shufps_r2r (xmm7,xmm7,0x93);
  maxps_r2r  (xmm7,xmm3); // xmm3= * max(j3,j2,j1) max(j2,j1,j0) *
}
static inline void MsseLocalMaximaList_2(const float* In, int* Out) {
  // xmm0= MinValue MinValue MinValue MinValue
  // xmm1= * max(i3,i2,i1) max(i2,i1,i0) *
  // xmm2= j3 j2 j1 j0,                   xmm3= * max(j3,j2,j1) max(j2,j1,j0) *
  cmpnleps_r2r (xmm4,xmm1); // xmm1= * (max(i3,i2,i1)<j2) (max(i2,i1,i0)<j1) *
  movaps_r2r  (xmm2,xmm4);
  shufps_r2r  (xmm4,xmm4,0x39);
  cmpltps_r2r (xmm2,xmm4); // xmm4= * (j3<j2) (j2<j1) *
  andps_r2r   (xmm4,xmm1); // xmm1= * max(j3,i3,i2,i1)<j2 max(j2,i2,i1,i0)<j1 *
  movaps_r2r  (xmm2,xmm4);
  shufps_r2r  (xmm4,xmm4,0x93);
  cmpltps_r2r (xmm2,xmm4); // xmm4= * (j1<j2) (j0<j1) *
  andps_r2r   (xmm4,xmm1);
  // xmm1= * (max(j3,i3,i2,i1,j1)<j2) (max(j2,i2,i1,i0,j0)<j1) *

  movaps_m2r (*In,xmm4);
  movaps_r2r (xmm4,xmm5); // xmm5= xmm4= k3 k2 k1 k0
  movaps_r2r (xmm4,xmm6);
  movaps_r2r (xmm4,xmm7);
  shufps_r2r (xmm6,xmm6,0x39);
  maxps_r2r  (xmm6,xmm4);
  shufps_r2r (xmm7,xmm7,0x93);
  maxps_r2r  (xmm7,xmm4); // xmm4= * max(k3,k2,k1) max(k2,k1,k0) *

  movaps_r2r   (xmm4,xmm6);
  cmpltps_r2r  (xmm2,xmm6); // xmm6= * (max(k3,k2,k1)<j2) (max(k2,k1,k0)<j1) *
  andps_r2r    (xmm6,xmm1);
  // xmm1= * max(j3,i3,i2,i1,j1,k3,k2,k1)<j2 max(j2,i2,i1,i0,j0,k3,k2,k1)<j1 *
  cmpnleps_r2r (xmm0,xmm2); // xmm2= * (MinValue<j2) (MinValue<j1) *
  andps_r2r    (xmm2,xmm1);
  // xmm1= * (max(MinValue,j3,i3,i2,i1,j1,k3,k2,k1)<j2)
  //         (max(MinValue,j2,i2,i1,i0,j0,k3,k2,k1)<j1) *
  andps_r2r    (xmm0,xmm1);
  shufps_r2r   (xmm1,xmm1,0x39);
  // xmm1= * * ((max(MinValue,j3,i3,i2,i1,j1,k3,k2,k1)<j2)? MinValue:0)
  //           ((max(MinValue,j2,i2,i1,i0,j0,k3,k2,k1)<j1)? MinValue:0)
  cvtps2pi_r2r (xmm1,mm0);
  pxor_r2r     (mm1,mm1);
  packssdw_r2r (mm1,mm0);
  movd_r2m     (mm0,*Out);

  movaps_r2r   (xmm3,xmm1);
  movaps_r2r   (xmm5,xmm2);
  movaps_r2r   (xmm6,xmm3);
}
#else
#endif

int MFloatImage::LocalMaximaList(float MinValue, int NbPointMax,
				 int* x_yMxStride) const {
  int NbPoint= 0;
  // if (MWithMmxSseUse()) {
  // } else
  /*
    for (int yy= yMin_+1; yy< yMax_-1; yy++) {
      float* P= Map+yy*x__+xMax_-1, a= Max(P[-x__], P[0], P[x__]),
	b0= P[-x__-1], b1= P[-1], b2= P[x__-1];
      P= Map+yy*x__+xMin_;
      for (int xx= xMax_-xMin_-2; xx; xx--) {
	float c0= P[-x__+xx], c1= P[xx], c2= P[x__+xx];
	if (MinValue<b1 && a<b1 && b0<b1 && b2<b1 && c0<b1 && c1<b1 && c2<b1) {
	  x_yMxStride[NbPoint++]= xMin_+xx+(yy<<MxShift);
	  if (NbPoint== NbPointMax)
	    return NbPoint;
	};
	a= Max(b0,b1,b2);
	b0= c0; b1= c1; b2= c2;
      };
    };
  */

	for (int yy= yMin_+1; yy< yMax_-1; yy++) {
		for (int xx= xMin_+1; xx< xMax_-1; xx++) {
			const float* P= Map+xx+yy*x__, V= P[0];
			if (MinValue< V && P[-1]<V && P[1]<V && P[x__]<V && P[-x__]<V &&
					P[-1+x__]<V && P[1+x__]<V && P[-1-x__]<V && P[1-x__]<V)
			{
				x_yMxStride[NbPoint++]= xx+(yy<<MxShift);
				if (NbPoint== NbPointMax)
				return NbPoint;
			};
		};
  };

  return NbPoint;
}

//
#ifdef ARCH_X86
static inline void MsseLocalMaxima1(float *res,float *PteIn,int xStride)
{
	movups_m2r(*PteIn,xmm0);                    //xmm0= i3 i2 i1 i0
	movups_m2r(*(PteIn+xStride+xStride),xmm2);  //xmm2= k3 k2 k1 k0
	maxps_r2r(xmm0,xmm2);                       //xmm2= max(i3,k3) max(i2,k2) max(i1,k1) max(i0,k0)

	movups_m2r(*(PteIn+xStride),xmm1);					//xmm1= j3         j2         j1         j0

	movaps_r2r(xmm1,xmm3);                      //xmm3= j3         j2         j1         j0
	shufps_r2r(xmm2,xmm3,0x6C);                 //xmm3= j3         j0         max(i1,k1) max(i2,k2)
  movaps_r2r(xmm2,xmm4);                      //xmm4= max(i3,k3) max(i2,k2) max(i1,k1) max(i0,k0)  
  shufps_r2r(xmm4,xmm4,0x9C);                 //xmm4= max(i3,k3) max(i0,k0) max(i2,k2) max(i1,k1)
  maxps_r2r(xmm4,xmm3);                       //xmm3= max(i3,j3,k3) max(i0,j0,k0) max(i2,k2,i1,k1) max(i2,k2,i1,k1)
  shufps_r2r(xmm1,xmm1,0x66);                 //xmm1= j1            j2            j1               j2
  maxps_r2r(xmm3,xmm1);       //xmm1= max(i3,j3,k3,j1)    max(i0,j0,k0,j2)    max(i2,k2,i1,j1,k1) max(i2,j2,k2,i1,k1)
  movaps_r2r(xmm1,xmm2);        
  shufps_r2r(xmm2,xmm2,0xEE); //xmm2= max(i2,k2,i1,j1,k1) max(i2,j2,k2,i1,k1) max(i2,k2,i1,j1,k1) max(i2,j2,k2,i1,k1)
  maxps_r2r(xmm1,xmm2);       //xmm2= max(i1,j1,k1,i2,k2,i3,j3,k3) max(i0,j0,k0,i1,k1,i2,j2,k2) * *
    
  movups_r2m(xmm2,*res);	
}
static inline void MsseLocalMaxima(MListePI& ListePoints,float MinValue,float *Map,int xStride,
																		int xmin,int xmax,int ymin,int ymax)
{
	float result[4];
	float *res=result;

	for (int yy=ymin+1;yy<ymax-1;yy++)
	{
		for (int xx=xmin+1;xx<xmax-2;xx+=2)	//Attention la fonction SSE est utilisee pour detecter 2 maximum consecutifs
		{
			float *PteIn=&(Map[xx+yy*xStride]);
      float *PteInAsm=&(Map[xx+yy*xStride-xStride-1]);
      if ( (PteIn[0]>MinValue) || (PteIn[1]>MinValue) )
      {
				MsseLocalMaxima1(res,PteInAsm,xStride);
				// au retour result[0] contient le maximum des voisins de PteIn[0] et result[1] la max des voisins de PteIn[1]
				if (PteIn[0]>result[0]) ListePoints.AddPoint(xx,yy);
				if (PteIn[1]>result[1]) ListePoints.AddPoint(xx+1,yy);
      }
		}	
	}	
}
#else
static inline void MsseLocalMaxima(MListePI& ListePoints,float MinValue,float *Map,int xStride,
																		int xmin,int xmax,int ymin,int ymax){}
#endif
void MFloatImage::LocalMaximaList(MListePI& ListePoints, float MinValue)
{
  /*if (MWithMmxSseUse())
  {
		//desactive car plus lent que la version C toute simple
		MsseLocalMaxima(ListePoints,MinValue,Map,x__,xMin_,xMax_,yMin_,yMax_);
	}
	else*/
	{
   for (int yy= yMin_+1; yy< yMax_-1; yy++)
		{
			for (int xx= xMin_+1; xx< xMax_-1; xx++)
			{
				const float* P= Map+xx+yy*x__, V= P[0];
				if (MinValue< V)
				{
					if (P[1]<V)
					{
						
						 if (P[-1]<V && P[x__]<V && P[-x__]<V && P[-1+x__]<V && P[1+x__]<V && P[-1-x__]<V && P[1-x__]<V)
						 {
							 ListePoints.AddPoint(xx,yy);
						 }
						 xx++;  //si P[0]>P[1] alors P[1] ne peut pas etre un max local, inutile de le tester
					};
				}
			};
		};
	}
}

//---------------------------------------------------------------------------------------------------
// MFloatImage::findMax
//---------------------------------------------------------------------------------------------------
float MFloatImage::findMax(int Zonexmin, int Zonexmax, int Zoneymin, int Zoneymax)
{
	int ymin,ymax,xmin,xmax;
  if ((Zonexmin==0) && (Zonexmax==0) && (Zoneymin==0) && (Zoneymax==0))
  {
    ymin=yMin_+1;
    ymax=yMax_-1;
    xmin=xMin_+1;
    xmax=xMax_-1;
  }
  else
  {
    //calcul des bornes de la fenetre de travail
    ymin=Max(yMin_,Zoneymin);
    ymax=Min(yMax_,Zoneymax);
    xmin=Max(xMin_,Zonexmin);
    xmax=Min(xMax_,Zonexmax);
  }
  
  float maxi=Map[xmin+ymin*x__];
  for (int yy=ymin; yy<ymax; yy++)
  {
    float *PteIn=Map+yy*x__;
    for (int xx=xmin; xx<xmax; xx++)
    {
      if (PteIn[xx]>maxi) maxi=PteIn[xx];
    }
  }
  return maxi;
}

//---------------------------------------------------------------------------------------------------
// MFloatImage::AddLocalMaximaZone
//---------------------------------------------------------------------------------------------------
int MFloatImage::AddLocalMaximaZone(float k,int NbPointMax,int* PointList,int PosListe,int& MeilleurMax,
				      int Zonexmin, int Zonexmax, int Zoneymin, int Zoneymax)
{
  //calcul des bornes de la fenetre de travail
  int ymin=Max(yMin_,Zoneymin);
  int ymax=Min(yMax_,Zoneymax);
  int xmin=Max(xMin_,Zonexmin);
  int xmax=Min(xMax_,Zonexmax);
  //Recherche du maximum dans la zone d'image
  float maxi=findMax(xmin,xmax,ymin,ymax);

  int NbPointTempMax=NbPointMax*2;
  int TempList[NbPointTempMax];
  
  //Recherche des maxima locaux qui dépassent k*maxi et ajout dans la liste temporaire
  int NbPoint=0;
  float MinValue=k*maxi;
  for (int yy= ymin+1; yy< ymax-1; yy++)
  {
      for (int xx= xmin+1; xx< xmax-1; xx++)
      {
				const float* P= Map+xx+yy*x__, V= P[0];
				if (MinValue< V && P[-1]<V && P[1]<V && P[x__]<V && P[-x__]<V &&
						P[-1+x__]<V && P[1+x__]<V && P[-1-x__]<V && P[1-x__]<V)
				{
					if (NbPoint<NbPointTempMax)
					{
						TempList[NbPoint]=xx+(yy<<MxShift);
						NbPoint++;
					}
				};
      };
    };

  //On retient les NbPointMax meilleurs maxima locaux et on les rajoute a la liste
  HeapSortMax(TempList,NbPoint);
  NbPoint=Min(NbPoint,NbPointMax);
  //On rajoute les meilleurs maxima dans la liste sauf le premier qui est retourne directement
  if (NbPoint>0) MeilleurMax=TempList[0];
  else MeilleurMax=-1;
  for (int i=1;i<NbPoint;i++)
  {
		PointList[PosListe+i-1]=TempList[i];
	}
  if (NbPoint>0) return (NbPoint+PosListe-1);
  else return PosListe;
}
void MFloatImage::HeapSortMax(int numbers[], int array_size)
{
  int i, temp;
  for (i = (array_size / 2)-1; i >= 0; i--)
  {
    HeapSortMaxsiftDown(numbers, i, array_size);
  }
  for (i = array_size-1; i >= 1; i--)
  {
    temp = numbers[0];
    numbers[0] = numbers[i];
    numbers[i] = temp;
    HeapSortMaxsiftDown(numbers, 0, i-1);
  }
}

void MFloatImage::HeapSortMaxsiftDown(int numbers[], int root, int bottom)
{
  int done, maxChild, temp;

  done = 0;
  while ((root*2 <= bottom) && (!done))
  {
    if (root*2 == bottom)
      maxChild = root * 2;
    else if (Map[ (numbers[root*2]&MxMask)+x__*(numbers[root * 2]>>MxShift)]
							< Map[(numbers[root*2+1]&MxMask)+x__*(numbers[root * 2+1]>>MxShift)])
      maxChild = root * 2;
    else
      maxChild = root * 2 + 1;
    if (Map[(numbers[root]&MxMask)+x__*(numbers[root]>>MxShift)]
					> Map[(numbers[maxChild]&MxMask)+x__*(numbers[maxChild]>>MxShift)])
    {
      temp = numbers[root];
      numbers[root] = numbers[maxChild];
      numbers[maxChild] = temp;
      root = maxChild;
    }
    else
      done = 1;
  }
}
//---------------------------------------------------------------------------------------------------
// MFloatImage::LocalMaximaList() par zones
//---------------------------------------------------------------------------------------------------
int MFloatImage::LocalMaximaList(float k, int NbPointMaxTotal,int NbPointMaxZone,int* PointList,int NbCasesX,int NbCasesY)
{
  int HauteurCase=y__/NbCasesY;
  int LargeurCase=x__/NbCasesX;
  int NbCases=NbCasesX*NbCasesY;

  int MeilleurMax[NbCases];	//on retient le meilleur max local pour chaque zone
   
  int PosListe=0;
  for (int i=0;i<NbCasesX;i++)
  {
    for (int j=0;j<NbCasesY;j++)
    {
      //Pour chaque case : ajout des NbPointMaxZone meilleurs maxima locaux
      PosListe=AddLocalMaximaZone(k,Min(NbPointMaxZone,NbPointMaxTotal-PosListe),PointList,PosListe,MeilleurMax[i+j*NbCasesX],
																	i*LargeurCase,(i+1)*LargeurCase,j*HauteurCase,(j+1)*HauteurCase);
      //cout << "(" << i << "," << j << "):" << PosListe << endl;
    }
  }
  //On retient les (NbPointMaxTotal-NbCases) meilleurs
  HeapSortMax(PointList,PosListe-1);
  int NbPoint;
  NbPoint=Min(PosListe-1,NbPointMaxTotal-NbCases);

  //On rajoute le meilleur max local de chaque zone s'il a ete elimine
  for (int i=0;i<NbCasesX;i++)
  {
    for (int j=0;j<NbCasesY;j++)
    {
			if (MeilleurMax[i+j*NbCasesX]!=-1)
			{
				PointList[NbPoint]=MeilleurMax[i+j*NbCasesX];
				NbPoint++;
			}
    }
  }
  return NbPoint;
}

//---------------------------------------------------------------------------------------------------
// Moyenneurs 5x5
//---------------------------------------------------------------------------------------------------
void MFloatImage::Somme5Y(MFloatImage& Data)
{
	SetSize(Data.x__,Data.y__);
 	for (int y=2;y<y__-2;y++)
	{
		float* PteIn=&(Data.Map[(y-2)*x__]);
    float* PteOut=&(Map[y*x__]);
		for (int x=0;x<x__;x++)
		{
			*PteOut=PteIn[0]+PteIn[x__]+PteIn[2*x__]+PteIn[3*x__]+PteIn[4*x__];
      PteOut++;
      PteIn++;
		}
	}
}
void MFloatImage::Somme5X(MFloatImage& Data)
{
	SetSize(Data.x__,Data.y__);
 	for (int x=2;x<x__-2;x++)
	{
		float* PteIn=&(Data.Map[x-2]);
    float* PteOut=&(Map[x]);
		for (int y=0;y<y__;y++)
		{
			*PteOut=PteIn[0]+PteIn[1]+PteIn[2]+PteIn[3]+PteIn[4];
      PteOut+=x__;
      PteIn+=x__;
		}
	}
}

void MFloatImage::Somme5x5(MFloatImage& Data)
{
	SetSize(Data.x__,Data.y__);
	MFloatImage Temp(x__,y__);
	Temp.Somme5X(Data);
	Somme5Y(Temp);
}


