// ****************************************************************************
// ******************************* MCharImage *********************************
// ****************************************************************************

#include "MCharImage.h"

MCharImage::MCharImage(int x_, int y_, int WithBlackInit):
                                                   x__(0), y__(0), yuv(NULL)  {
  SetSize(x_, y_);
  if (WithBlackInit && x__ && y__)
    FillBlack();
}
MCharImage::~MCharImage() {
  if (x__)
    MAlign16ByteDelete((int*)yuv);
}

void MCharImage::NoColor() {
  if (x__ && y__)
    memset(yuv+x__*y__, 128, x__*y__/2);
}
void MCharImage::FillBlack() {
  if (x__ && y__)
    memset(yuv, 0, x__*y__);
  NoColor();
}
void MCharImage::FillGray(unsigned char GrayLevel)
{
  if (x__ && y__)
    memset(yuv, GrayLevel, x__*y__);
  NoColor();
}

void MCharImage::FillBlackBorder(int BorderSize) {
  if (!x__ || !y__ || !BorderSize)
    return;
  assert(BorderSize< 9);
  memset(yuv,                      0, x__*BorderSize);
  memset(yuv+x__*(y__-BorderSize), 0, x__*BorderSize);
  if (MWithMmxSseUse()) {
#ifdef ARCH_X86
    static mmx_t mmx_ffffffff={0xffffffffffffffffull};
    movq_m2r (mmx_ffffffff,mm0);
    movq_m2r (mmx_ffffffff,mm1);
    switch (BorderSize) {
    case 1: psllq_i2r (8,mm0);  psrlq_i2r (8,mm1);  break;
    case 2: psllq_i2r (16,mm0); psrlq_i2r (16,mm1); break;
    case 3: psllq_i2r (24,mm0); psrlq_i2r (24,mm1); break;
    case 4: psllq_i2r (32,mm0); psrlq_i2r (32,mm1); break;
    case 5: psllq_i2r (40,mm0); psrlq_i2r (40,mm1); break;
    case 6: psllq_i2r (48,mm0); psrlq_i2r (48,mm1); break;
    case 7: psllq_i2r (56,mm0); psrlq_i2r (56,mm1); break;
    };
    unsigned char *OutLeft= yuv, *OutRight= yuv+x__-8;
    for (int yy= y__; yy; yy--) {
      movq_m2r   (*OutLeft,mm2);
      movq_m2r   (*OutRight,mm3);
      pand_r2r   (mm0,mm2);
      pand_r2r   (mm1,mm3);
      #ifdef __WITH_MOVNTQ__
      movntq_r2m (mm2,*OutLeft);
      movntq_r2m (mm3,*OutRight);
      #else
      movq_r2m (mm2,*OutLeft);
      movq_r2m (mm3,*OutRight);
      #endif
      OutLeft+=  x__;
      OutRight+= x__;
    };
    Memms();
#endif
 } else {
    int MaskRight, MaskLeft;
    switch (BorderSize&3) {
    case 0: MaskRight= 0xffffffff; MaskLeft= 0xffffffff; break;
    case 1: MaskRight= 0x00ffffff; MaskLeft= 0xffffff00; break;
    case 2: MaskRight= 0x0000ffff; MaskLeft= 0xffff0000; break;
    case 3: MaskRight= 0x000000ff; MaskLeft= 0xff000000; break;
    };
    int* P= (int*)yuv, x4= x__/4;
    if (3< BorderSize) {
      for (int yy= y__; yy; yy--) {
	P[0]= 0;    P[1]&=    MaskLeft;
	P[x4-1]= 0; P[x4-2]&= MaskRight;
	P+= x4;
      };
    } else
      for (int yy= y__; yy; yy--) {
	P[0]&=    MaskLeft;
	P[x4-1]&= MaskRight;
	P+= x4;
      };
  };
}

ostream& operator<<(ostream& Out, const MCharImage& Image) {
  Out << "MFloatImage " << Image.x__ << "x" << Image.y__ << endl;
  for (int yy= 0; yy< Image.y__; yy++) {
    for (int xx= 0; xx< Image.x__; xx++)
      Out << int(Image.y_(xx,yy)) << " ";
    Out << endl << endl;
  };
  Out << endl;
  return Out;
}


// ****************************************************************************

#ifdef ARCH_X86
static inline void MsseConvertFloatToChar_1(const float* Min_,
					    const float* Max_) {
  static sse_t sse_255 = { 255, 255, 255, 255 };
  movss_m2r  (*Min_,xmm0);
  movss_m2r  (*Max_,xmm2);
  shufps_r2r (xmm0,xmm0,0);  // xmm0= Min Min Min Min
  shufps_r2r (xmm2,xmm2,0);
  movaps_r2r (xmm2,xmm3);    // xmm3= Max Max Max Max
  subps_r2r  (xmm0,xmm2);
  movaps_m2r (sse_255,xmm1);
  divps_r2r  (xmm2,xmm1);    // xmm1= 255/(Max-Min) .. 255/(Max-Min)
}
static inline void MsseConvertFloatToChar_2(const float* In, uint8_t* Out) {
  movaps_m2r   (*In,xmm4);
  minps_r2r    (xmm3,xmm4);
  maxps_r2r    (xmm0,xmm4);
  subps_r2r    (xmm0,xmm4);
  mulps_r2r    (xmm1,xmm4);
  // xmm4= (255/(Max-Min))*(max(Min, min(Max,(*In)))-Min)
  cvtps2pi_r2r (xmm4,mm0); // mm0= f1 f0
  shufps_r2r   (xmm4,xmm4,0x4e);
  cvtps2pi_r2r (xmm4,mm1); // mm1= f3 f2
  packssdw_r2r (mm1,mm0);  // mm0= f3 f2 f1 f0

  movaps_m2r   (*(In+4),xmm4);
  minps_r2r    (xmm3,xmm4);
  maxps_r2r    (xmm0,xmm4);
  subps_r2r    (xmm0,xmm4);
  mulps_r2r    (xmm1,xmm4);
  // xmm4= (255/(Max-Min))*(max(Min, min(Max,(*(In+4))))-Min)
  cvtps2pi_r2r (xmm4,mm2); // mm2= f5 f4
  shufps_r2r   (xmm4,xmm4,0x4e);
  cvtps2pi_r2r (xmm4,mm3); // mm4= f7 f6
  packssdw_r2r (mm3,mm2);  // mm2= f7 f6 f5 f4

  packuswb_r2r (mm2,mm0);  // mm0= f7 f6 f5 f4 f3 f2 f1 f0
  #ifdef __WITH_MOVNTQ__
  movntq_r2m   (mm0,*Out);
  #else
  movq_r2m   (mm0,*Out);
  #endif
}
static inline void MsseConvertFloatToChar(const float* Min_, const float* Max_,
				   const float* In, int Length, uint8_t* Out) {
  MsseConvertFloatToChar_1(Min_, Max_);
  for (int i= Length/8; i; i--) {
    MsseConvertFloatToChar_2(In, Out);
    In+= 8;
    Out+= 8;
  };
}
#else
static inline void MsseConvertFloatToChar(const float* Min_, const float* Max_,
				  const float* In, int Length, uint8_t* Out) {}
#endif

void MCharImage::ConvertFrom(const MFloatImage&Image, float Min_, float Max_) {
  SetSize(Image.x(), Image.y());
  if (MWithMmxSseUse()) {
    MsseConvertFloatToChar(&Min_, &Max_, &Image.M_(0,0), x__*y__, yuv);
    Memms(); // now FPU can be used
  } else {
    const float *In= &Image.M_(0,0)-1;
    unsigned char *Out= yuv-1;
    for (int i= x__*y__; i; i--)
      yuv[i]=
	(unsigned char)(255/(Max_-Min_)*(Min(Max_, Max(Min_, In[i]))-Min_));
  };
  memset(yuv+x__*y__, 128, x__*y__/2);
}

// ***************************** draw local maxima ****************************

#ifdef ARCH_X86
static inline void MmmxDrawLocalMaxima_1() {
  pxor_r2r      (mm0,mm0);
}
static inline void MmmxDrawLocalMaxima_2(const unsigned char* In,
					 unsigned char* Out) {
  movq_m2r      (*In,mm1);
  movq_r2r      (mm1,mm2);
  punpcklbw_r2r (mm0,mm1);
  punpckhbw_r2r (mm0,mm2);
  psrlw_i2r     (1,mm1);
  psrlw_i2r     (1,mm2);
  packuswb_r2r  (mm2,mm1);
  #ifdef __WITH_MOVNTQ__
  movntq_r2m    (mm1,*Out);
  #else
  movq_r2m    (mm1,*Out);
  #endif

}
#else
static inline void MmmxDrawLocalMaxima_1() {}
static inline void MmmxDrawLocalMaxima_2(const unsigned char* In,
					 unsigned char* Out) {}
#endif

void MCharImage::DrawLocalMaxima(const MCharImage& Data, int NbPoint,
				 int* x_yMxShift) {
  SetSize(Data.x__, Data.y__);
  if (MWithMmxSseUse()) {
    MmmxDrawLocalMaxima_1();
    const unsigned char* In= Data.yuv-8;
    unsigned char* Out= yuv-8;
    for (int i= x__*y__; i; i-= 8)
      MmmxDrawLocalMaxima_2(In+i, Out+i);
    Memms(); // now FPU can be used
  } else
    for (int i= x__*y__; i; i--)
      yuv[i-1]= (Data.yuv[i-1]>>1);
  for (int i= 0; i< NbPoint; i++)
    yuv[(x_yMxShift[i]&MxMask)+x__*(x_yMxShift[i]>>MxShift)]= 255;
}


// ****************************************************************************
void MCharImage::CopyDiv2(const MCharImage& Data)
{
  SetSize(Data.x__, Data.y__);
  if (MWithMmxSseUse())
  {
    MmmxDrawLocalMaxima_1();
    const unsigned char* In= Data.yuv-8;
    unsigned char* Out= yuv-8;
    for (int i= x__*y__; i; i-= 8)
      MmmxDrawLocalMaxima_2(In+i, Out+i);
    Memms(); // now FPU can be used
  }
  else
  {
    for (int i= x__*y__; i; i--) yuv[i-1]= (Data.yuv[i-1]>>1);
  }
}

// ****************************************************************************

int MCharImage::Load(FILE* Flux) {
  int xx,yy,zz;
  char C= fgetc(Flux);
  if (C!= 'P')                                return 0;
  C= fgetc(Flux);
  if (C!= '5')                                return 0;
  if (!fscanf(Flux, "%d %d %d", &xx,&yy,&zz)) return 0;
  if (xx== 0 || yy== 0)                       return 0;
  SetSize(xx, (yy*2)/3);
  fgetc(Flux); // skip return
  return fread(yuv, 1, xx*yy, Flux);
}
int MCharImage::Load(const char* PpmOrPgm) {
  int IsColored= 0;
  ifstream Image(PpmOrPgm);
  if (Image.bad())
    return 0;
  char C;
  while ((C = Image.get()) == '#') { do C = Image.get(); while (C != '\n'); };
  Image.putback(C);

  char Tmp[256];
  Image.get(Tmp, 256, '\n');
  Image.get();
  if (!strcmp(Tmp, "P5"))                            IsColored= 0;
  else if (!strcmp(Tmp, "P6") || !strcmp(Tmp, "P3")) IsColored= 1;
  else return 0;

  while ((C = Image.get()) == '#') { do C = Image.get(); while (C != '\n'); };
  Image.putback(C);

  int xx, yy, zz;
  Image >> xx >> yy >> zz;
  if ((xx <= 0) || (yy <= 0) || (zz!= 255 && zz!= 254))
    return 0;
  SetSize(xx,yy);
  Image.get();

  int L= x__*y__;
  if (!IsColored)
    Image.read((char*)yuv, L);
  else if (!strcmp(Tmp, "P6")) { // color binary
    char* Tmp2= new char[3*L];
    Image.read(Tmp2, 3*L);
    for (int i= 0; i< L; i++)
      yuv[i]= Tmp2[3*i+1];
    delete[] Tmp2;
  } else if (!strcmp(Tmp, "P3")) { // color ascii
    int R, G, B;
    for (int i= 0; i< L; i++) {
      Image >> R >> G >> B;
      yuv[i]= G;
    };
  };
  NoColor();
  return 1;
}
const MCharImage& MCharImage::operator=(const MCharImage& Image) {
  assert(Image.x__ && Image.y__);
  SetSize(Image.x__, Image.y__);
  memcpy(yuv, Image.yuv, (x__*y__*3)/2);
  return *this;
}
void MCharImage::PointMark(int NbPoint,int*PointIndices,int WithWhite1Black2) {
  for (int i= 0; i< NbPoint; i++) {
    int xx= PointIndices[i]%MxStride, yy= (PointIndices[i]>>MxShift);
    if (xx && yy && xx+1< x__ && yy+1< y__) {
      int xMin= Max(0, xx-2), xMax= Min(xx+3, x__),
	  yMin= Max(0, yy-2), yMax= Min(yy+3, y__);
      if (WithWhite1Black2&1) {
	for (int x0= xMin; x0< xMax; x0++)
	  y(x0,yy)= (unsigned char)240;
	for (int y0= yMin; y0< yMax; y0++)
	  y(xx,y0)= (unsigned char)240;
      };
      if (WithWhite1Black2&2) {
	xx++; yy++; xMin++; yMin++;
	xMax= Min(x__, xMax+1); yMax= Min(y__, yMax+1);
	for (int x0= xMin; x0< xMax; x0++)
	  y(x0,yy)= (unsigned char)16;
	for (int y0= yMin; y0< yMax; y0++)
	  y(xx,y0)= (unsigned char)16;
      };
    };
  };
}

void MCharImage::Put(const MCharImage& Data, int xx, int yy) {
  assert(0<= xx && xx+Data.x__<= x__ && 0<= yy && yy+Data.y__<= y__);
  assert(!(xx&1) && !(yy&1));
  for (int y0= 0; y0< Data.y__; y0++)
    memcpy(yuv+xx+(yy+y0)*x__, Data.yuv+y0*Data.x__, Data.x__);
  for (int y1= 0; y1< Data.y__/2; y1++) {
    unsigned char *In=  Data.yuv+(Data.y__+y1)*Data.x__,
                  *Out= yuv+(y__+yy/2+y1)*x__+xx/2;
    memcpy(Out,       In,            Data.x__/2);
    memcpy(Out+x__/2, In+Data.x__/2, Data.x__/2);
  };
}

// ****************************** rotate left *********************************

#ifdef ARCH_X86
static inline void MmmxRotate_1a() {
  static mmx_t mmx_0000000f={0x00000000000000ffull};
  movq_m2r (mmx_0000000f,mm2);
}
static inline void MmmxRotate_1b() {
  static mmx_t mmx_f0000000={0xff00000000000000ull};
  movq_m2r (mmx_f0000000,mm2);
}
static inline void MmmxRotate_2a(const unsigned char*& In, int xStrideIn) {
  movq_r2r  (*In,mm0); // mm0= * * * * a3 a2 a1 a0
  In+= xStrideIn;

  movq_r2r  (mm0,mm1);
  pand_r2r  (mm2,mm1);
  psllq_r2r (8,mm4);
  por_r2r   (mm1,mm4); // mm4= * * * * * *  *  a0
  psrlq_r2r (8,mm0);   // mm0= 0 * * * * a3 a2 a1

  movq_r2r  (mm0,mm1);
  pand_r2r  (mm2,mm1);
  psllq_r2r (8,mm5);
  por_r2r   (mm1,mm5); // mm5= * * * * * * *  a1
  psrlq_r2r (8,mm0);   // mm0= 0 0 * * * * a3 a2

  movq_r2r  (mm0,mm1);
  pand_r2r  (mm2,mm1);
  psllq_r2r (8,mm6);
  por_r2r   (mm1,mm6); // mm6= * * * * * * *  a2
  psrlq_r2r (8,mm0);   // mm0= 0 * * * * * a3 a2

  pand_r2r  (mm2,mm0);
  psllq_r2r (8,mm7);
  por_r2r   (mm0,mm7); // mm7= * * * * * * *  a3
}
static inline void MmmxRotate_2b(const unsigned char*& In, int xStrideIn) {
  movq_r2r  (*In,mm0); // mm0= a3 a2 a1 a0 * * * *
  In+= xStrideIn;

  movq_r2r  (mm0,mm1);
  pand_r2r  (mm2,mm1);
  psrlq_r2r (8,mm4);
  por_r2r   (mm1,mm4); // mm4= a3 *  *  * * * * *
  psllq_r2r (8,mm0);   // mm0= a2 a1 a0 * * * * 0

  movq_r2r  (mm0,mm1);
  pand_r2r  (mm2,mm1);
  psrlq_r2r (8,mm5);
  por_r2r   (mm1,mm5); // mm5= a2 *  * * * * * *
  psllq_r2r (8,mm0);   // mm0= a1 a0 * * * * 0 0

  movq_r2r  (mm0,mm1);
  pand_r2r  (mm2,mm1);
  psrlq_r2r (8,mm6);
  por_r2r   (mm1,mm6); // mm6= a1 * * * * * * *
  psllq_r2r (8,mm0);   // mm0= a0 * * * * 0 0 0

  pand_r2r  (mm2,mm0);
  psrlq_r2r (8,mm7);
  por_r2r   (mm0,mm7); // mm7= a0 * * * * * * *
}
static inline void MmmxRotate_3a(const unsigned char*& Out, int xStrideOut) {
  #ifdef __WITH_MOVNTQ__
  movntq_r2m (mm7,*Out); // mm7= a3 b3 c3 d3 e3 f3 g3 h3
  Out+= xStrideOut;
  movntq_r2m (mm6,*Out); // mm6= a2 b2 c2 d2 e2 f2 g2 h2
  Out+= xStrideOut;
  movntq_r2m (mm5,*Out); // mm5= a1 b1 c1 d1 e1 f1 g1 h1
  Out+= xStrideOut;
  movntq_r2m (mm4,*Out); // mm4= a0 b0 c0 d0 e0 f0 g0 h0
  Out+= xStrideOut;
  #else
  movq_r2m (mm7,*Out); // mm7= a3 b3 c3 d3 e3 f3 g3 h3
  Out+= xStrideOut;
  movq_r2m (mm6,*Out); // mm6= a2 b2 c2 d2 e2 f2 g2 h2
  Out+= xStrideOut;
  movq_r2m (mm5,*Out); // mm5= a1 b1 c1 d1 e1 f1 g1 h1
  Out+= xStrideOut;
  movq_r2m (mm4,*Out); // mm4= a0 b0 c0 d0 e0 f0 g0 h0
  Out+= xStrideOut;
  #endif
}
static inline void MmmxRotate_3b(const unsigned char*& Out, int xStrideOut) {
  #ifdef __WITH_MOVNTQ__
  movntq_r2m (mm4,*Out); // mm4= a0 b0 c0 d0 e0 f0 g0 h0
  Out+= xStrideOut;
  movntq_r2m (mm5,*Out); // mm5= a1 b1 c1 d1 e1 f1 g1 h1
  Out+= xStrideOut;
  movntq_r2m (mm6,*Out); // mm6= a2 b2 c2 d2 e2 f2 g2 h2
  Out+= xStrideOut;
  movntq_r2m (mm7,*Out); // mm7= a3 b3 c3 d3 e3 f3 g3 h3
  Out+= xStrideOut;
  #else
  movq_r2m (mm4,*Out); // mm4= a0 b0 c0 d0 e0 f0 g0 h0
  Out+= xStrideOut;
  movq_r2m (mm5,*Out); // mm5= a1 b1 c1 d1 e1 f1 g1 h1
  Out+= xStrideOut;
  movq_r2m (mm6,*Out); // mm6= a2 b2 c2 d2 e2 f2 g2 h2
  Out+= xStrideOut;
  movq_r2m (mm7,*Out); // mm7= a3 b3 c3 d3 e3 f3 g3 h3
  Out+= xStrideOut;
  #endif
}
#else
static inline void MmmxRotate_1a() {}
static inline void MmmxRotate_1b() {}
static inline void MmmxRotate_2a(const unsigned char*& In, int xStrideIn) {}
static inline void MmmxRotate_2b(const unsigned char*& In, int xStrideIn) {}
static inline void MmmxRotate_3a(const unsigned char*& Out, int xStrideOut) {}
static inline void MmmxRotate_3b(const unsigned char*& Out, int xStrideOut) {}
#endif

void MCharImage::RotateLeft(const MCharImage& Data) {
  SetSize(Data.y__, Data.x__);
  const unsigned char* PteIn= Data.yuv;
  unsigned char* PteOut= yuv;
  int xx, yy;
  for (yy= 0; yy< Data.y__; yy++)
    for (xx= 0; xx< Data.x__; xx++)
      PteOut[yy+Data.y__*(Data.x__-1-xx)]= PteIn[xx+yy*Data.x__];
  PteIn=  Data.yuv+Data.y__*Data.x__;
  PteOut= yuv+Data.y__*Data.x__;
  for (yy= 0; yy< Data.y__/2; yy++)
    for (xx= 0; xx< Data.x__/2; xx++)
      PteOut[yy+Data.y__*(Data.x__/2-1-xx)]= PteIn[xx+yy*Data.x__];
  PteIn=  Data.yuv+Data.y__*Data.x__+Data.x__/2;
  PteOut= yuv+Data.y__*Data.x__+Data.y__/2;
  for (yy= 0; yy< Data.y__/2; yy++)
    for (xx= 0; xx< Data.x__/2; xx++)
      PteOut[yy+Data.y__*(Data.x__/2-1-xx)]= PteIn[xx+yy*Data.x__];
}
void MCharImage::RotateRight(const MCharImage& Data) {
  SetSize(Data.y__, Data.x__);
  const unsigned char* PteIn= Data.yuv;
  unsigned char* PteOut= yuv;
  int xx, yy;
  for (yy= 0; yy< Data.y__; yy++)
    for (xx= 0; xx< Data.x__; xx++)
      PteOut[(Data.y__-1-yy)+Data.y__*xx]= PteIn[xx+yy*Data.x__];
  PteIn=  Data.yuv+Data.y__*Data.x__;
  PteOut= yuv+Data.y__*Data.x__;
  for (yy= 0; yy< Data.y__/2; yy++)
    for (xx= 0; xx< Data.x__/2; xx++)
      PteOut[(Data.y__/2-1-yy)+Data.y__*xx]= PteIn[xx+yy*Data.x__];
  PteIn=  Data.yuv+Data.y__*Data.x__+Data.x__/2;
  PteOut= yuv+Data.y__*Data.x__+Data.y__/2;
  for (yy= 0; yy< Data.y__/2; yy++)
    for (xx= 0; xx< Data.x__/2; xx++)
      PteOut[(Data.y__/2-1-yy)+Data.y__*xx]= PteIn[xx+yy*Data.x__];
}

// ******************************* reduce 2x2 *********************************

#ifdef ARCH_X86
static inline void Mmmx2x2ReduceChar(uint8_t* Line0, uint8_t* Line1,
				     uint8_t* Image) {
  static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffull}; // use mm5 to store it
  movq_m2r     (*Line0,mm0);  // mm0= v7 v6 v5 v4 v3 v2 v1 v0
  movq_m2r     (*Line1,mm1);  // mm1= w7 w6 w5 w4 w3 w2 w1 w2
  movq_r2r     (mm0,mm2);
  movq_r2r     (mm1,mm3);
  psrlw_i2r    (8,mm2);       // mm2= 00 v7 00 v5 00 v3 00 v1
  psrlw_i2r    (8,mm3);       // mm3= 00 w7 00 w5 00 w3 00 w1
  // pavgusb_r2r (mm0,mm2);   // illegal instruction for I7500 !
  // pavgusb_r2r (mm1,mm3);
  // pavgusb_r2r (mm2,mm3);   // mm3= 00 (v6+w6+v7+w7)/4 ... 00 (v0+w0+v1+w1)/4
  movq_m2r  (mmx_00ffw, mm4); // mm4= 00 ff 00 ff 00 ff 00 ff
  pand_r2r     (mm4,mm0);     // mm0= 00 v6 00 v4 00 v2 00 v0
  pand_r2r     (mm4,mm1);     // mm1= 00 w6 00 w4 00 w2 00 w0
  paddw_r2r    (mm0,mm2);
  paddw_r2r    (mm1,mm3);
  paddw_r2r    (mm2,mm3);
  psrlw_i2r    (2, mm3);      // mm3= 00 (v6+w6+v7+w7)/4 ... 00 (v0+w0+v1+w1)/4
  packuswb_r2r (mm3,mm3);     // mm3.d= (v6+w6+v7+w7)/4 ... (v0+w0+v1+w1)/4
  movd_r2m     (mm3,*Image);  // store mm3.d
};

static inline void Mmmx2x2ReduceChar(uint8_t* In0, int InStride,
		    uint8_t* Out, int OutStride, int OutWidth, int OutHeight) {
  uint8_t* In1= In0+InStride;
  OutWidth&= ~3;
  InStride= 2*InStride-2*OutWidth;
  OutStride-= OutWidth;
  OutWidth>>= 2;
  for (int j= OutHeight; j; j--) {
    for (int i= OutWidth; i; i--) {
      Mmmx2x2ReduceChar(In0, In1, Out);
      In0+= 8;
      In1+= 8;
      Out+= 4;
    };
    In0+= InStride;
    In1+= InStride;
    Out+= OutStride;
  };
}
#else
static inline void Mmmx2x2ReduceChar(uint8_t* In0, int InStride,
                   uint8_t* Out, int OutStride, int OutWidth, int OutHeight) {}
#endif

void MCharImage::Reduce2x2(const MCharImage& Data) {
  SetSize(((Data.x__/2)&(~7)), ((Data.y__/2)&(~1)));
  assert(x__ && y__);
  if (MWithMmxSseUse()) {
    Mmmx2x2ReduceChar(Data.yuv, Data.x__, yuv, x__, x__, y__);
    int S= Data.x__*Data.y__, S2= x__*y__;
    Mmmx2x2ReduceChar(Data.yuv+S, Data.x__, yuv+S2, x__, x__/2, y__/2);
    Mmmx2x2ReduceChar(Data.yuv+S+Data.x__/2, Data.x__,
		      yuv+S2+x__/2, x__, x__/2, y__/2);
    Memms(); // now, FPU can be used
  } else assert(0); // not yet done !
}

// ****************************************************************************

void MCharImage::ConvertTo16BitScreen(short*Color16Bit,int BytesPerLine) const{
  if (MWithMmxSseUse()) {
    yuv420_rgb16((uint8_t*)Color16Bit, yuv, yuv+y__*x__, yuv+y__*x__+x__/2,
		 x__,y__, BytesPerLine,x__,x__, 1); // 2*x__ before
    Memms(); // now, FPU can be used
  } else // do not use MMX instructions (8 times slower)
    for (int yy= 0; yy< y__; yy++) {
      const unsigned char *PteIn= yuv+x__*yy, *PteIn2= yuv+x__*(y__+yy/2);
      short* PteOut= Color16Bit+(BytesPerLine/2)*yy;
      for (int xx= 0; xx< x__; xx++) {
	int x2= xx/2;
	float y=255.*(PteIn[xx]-16)/219.,
	      u=255.*(PteIn2[x2]-128)/224., v=255.*(PteIn2[x2+x__/2]-128)/224.;
	short R= short(Max(0.,Min(255., y+1.40199*v))),
	      G= short(Max(0.,Min(255., y-.344113*u-0.714104*v))),
	      B= short(Max(0.,Min(255., y+1.77198*u-.000134583*v)));
	PteOut[xx]= (short)(((R<<8)&0xf800)|((G<<3)&0x07e0)|((B>>3)&0x001f));
      };
    };
}

// ********************************* Smooth121 ********************************

#ifdef ARCH_X86
static inline void Mmmx121ySmooth_1(const unsigned char* In, int xStride) {
  static mmx_t mmx_2222= {0x0002000200020002ull};
  movq_m2r      (mmx_2222,mm7);
  pxor_r2r      (mm0,mm0);  // mm0= 0
  movq_m2r      (*In,mm1);  // mm1= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r      (mm1,mm2);
  punpcklbw_r2r (mm0,mm1);  // mm1= 00 i3 00 i2 00 i1 00 i0
  punpckhbw_r2r (mm0,mm2);  // mm2= 00 i7 00 i6 00 i5 00 i4

  movq_m2r      (*(In+xStride),mm3);
  movq_r2r      (mm3,mm4);
  punpcklbw_r2r (mm0,mm3);  // mm3= 00 j3 00 j2 00 j1 00 j0
  punpckhbw_r2r (mm0,mm4);  // mm4= 00 j7 00 j6 00 j5 00 j4
}
static inline void Mmmx121ySmooth_2(const unsigned char*In,unsigned char*Out) {
  movq_m2r      (*In,mm5);  // (*(Out+xStride),mm5);
  movq_r2r      (mm5,mm6);
  punpcklbw_r2r (mm0,mm5);  // mm5= 00 k3 00 k2 00 k1 00 k0
  punpckhbw_r2r (mm0,mm6);  // mm6= 00 k7 00 k6 00 k5 00 k4

  paddusw_r2r   (mm3,mm1);
  paddusw_r2r   (mm4,mm2);
  paddusw_r2r   (mm3,mm1);
  paddusw_r2r   (mm4,mm2);
  paddusw_r2r   (mm5,mm1);
  paddusw_r2r   (mm6,mm2);
  paddw_r2r     (mm7,mm1);
  paddw_r2r     (mm7,mm2);
  psrlw_i2r     (2,mm1);   // mm1= 0 (i3+j3+j3+k3+2)>>2 .. 0 (i0+j0+j0+k0+2)>>2
  psrlw_i2r     (2,mm2);   // mm2= 0 (i7+j7+j7+k7+2)>>2 .. 0 (i4+j4+j4+k4+2)>>2
  packuswb_r2r  (mm2,mm1); // mm1= (i7+j7+j7+k7+2)>>2 ... (i0+j0+j0+k0+2)>>2
  #ifdef __WITH_MOVNTQ__
  movntq_r2m    (mm1,*Out);
  #else
  movq_r2m    (mm1,*Out);
  #endif
  movq_r2r      (mm3,mm1);
  movq_r2r      (mm4,mm2);
  movq_r2r      (mm5,mm3);
  movq_r2r      (mm6,mm4);
}
static inline void Mmmx121ySmooth(const unsigned char* In, int xStrideIn,
			    int x, int y, unsigned char* Out, int xStrideOut) {
  for (int xx= x/8; xx; xx--) {
    const unsigned char* PteIn= In+8*(xx-1);
    Mmmx121ySmooth_1(PteIn, xStrideIn);
    PteIn+= (xStrideIn+xStrideIn);
    unsigned char* PteOut= Out+8*(xx-1)+xStrideOut;
    for (int yy= y-2; yy; yy--) {
      Mmmx121ySmooth_2(PteIn, PteOut);
      PteIn+=  xStrideIn;
      PteOut+= xStrideOut;
    };
  };
}


static inline void Mmmx121ySmooth_pavgb1(const unsigned char *In,unsigned char *Out,int xStride)
{
  movq_m2r(*In,mm0); //mm0=a
  movq_m2r(*(In+xStride+xStride),mm2); //mm2=c
  pavgb_r2r(mm0,mm2);          //mm2=(a+c)/2
  movq_m2r(*(In+xStride),mm3);           //mm3=b
  pavgb_r2r(mm2,mm3);          //mm3=(b+(a+c)/2)/2=(a+2*b+c)/4
  movq_r2m(mm3,*Out);  
}
static inline void Msse2_121ySmooth_pavgb1(const unsigned char *In,unsigned char *Out,int xStride)
{
  movdqa_m2r(*In,xmm0);                    //xmm0=a
  movdqa_m2r(*(In+xStride+xStride),xmm2);  //xmm2=c
  pavgb_r2r(xmm0,xmm2);                    //xmm2=(a+c)/2
  movdqa_m2r(*(In+xStride),xmm3);          //xmm3=b
  pavgb_r2r(xmm2,xmm3);                    //xmm3=(b+(a+c)/2)/2=(a+2*b+c)/4
  movdqa_r2m(xmm3,*Out);
}
static inline void Mmmx121ySmooth_pavgb(const unsigned char* In, int xStrideIn,
			    int x, int y, unsigned char* Out, int xStrideOut)
{
  for (int xx= 0; xx<x; xx+=8)
  {
    const unsigned char* PteIn= In+xx;
    unsigned char* PteOut= Out+xx;
    for (int yy= 0; yy<y-1; yy++) {
      Mmmx121ySmooth_pavgb1(PteIn, PteOut,xStrideIn);
      PteIn+=  xStrideIn;
      PteOut+= xStrideOut;
    };
  };
}
static inline void Msse2_121ySmooth_pavgb(const unsigned char* In, int xStrideIn,
			    int x, int y, unsigned char* Out, int xStrideOut)
{
  for (int xx= 0; xx<x; xx+=16)
  {
    const unsigned char* PteIn= In+xx;
    unsigned char* PteOut= Out+xx;
    for (int yy= 0; yy<y-1; yy++) {
      Msse2_121ySmooth_pavgb1(PteIn, PteOut,xStrideIn);
      PteIn+=  xStrideIn;
      PteOut+= xStrideOut;
    };
  };
}

static inline void Mmmx121xSmooth_1(const unsigned char* In) {
  static mmx_t mmx_2222= {0x0002000200020002ull};
  movq_m2r      (mmx_2222,mm7);
  pxor_r2r      (mm0,mm0);  // mm0= 0
  pxor_r2r      (mm1,mm1);  // mm1= 0

  movq_m2r      (*In,mm2); // mm2= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r      (mm2,mm3);
  punpcklbw_r2r (mm0,mm2);
  punpckhbw_r2r (mm0,mm3);
}
static inline void Mmmx121xSmooth_2(const unsigned char* In,
				    unsigned char* Out) {
  static mmx_t mmx_0fff= {0x0000ffffffffffffull}, mmx_fff0= {0xffffffffffff0000ull},
	       mmx_f000= {0xffff000000000000ull}, mmx_000f= {0x000000000000ffffull};
  // mm3= i7 i6 i5 i4   mm2= i3 i2 i1 i0   mm1= 00 00 00 j7

  pshufw_r2r    (mm2,mm4,0x39); // mm4= i0 i3 i2 i1
  pshufw_r2r    (mm3,mm5,0x39); // mm5= i4 i7 i6 i5
  movq_r2r      (mm5,mm6);
  pand_m2r      (mmx_0fff,mm4);
  pand_m2r      (mmx_f000,mm6);
  pand_m2r      (mmx_0fff,mm5);
  por_r2r       (mm6,mm4);
  // mm5= 00 i7 i6 i5   mm4= i4 i3 i2 i1
  paddw_r2r     (mm1,mm4); // mm4= i4 i3 i2 i1+j7 (j7 is the value on the left)

  pshufw_r2r    (mm2,mm6,0x93); // mm6= i2 i1 i0 i3
  pand_m2r      (mmx_fff0,mm6);
  paddw_r2r     (mm6,mm4);
  pshufw_r2r    (mm3,mm6,0x93); // mm6= i6 i5 i4 i7
  pand_m2r      (mmx_fff0,mm6);
  paddw_r2r     (mm6,mm5);
  pshufw_r2r    (mm2,mm6,0x93); // mm6= i2 i1 i0 i3
  pand_m2r      (mmx_000f,mm6);
  paddw_r2r     (mm6,mm5);
  // this was the second solution without using mm7

  paddw_r2r     (mm2,mm4);
  paddw_r2r     (mm3,mm5);
  paddw_r2r     (mm2,mm4);
  paddw_r2r     (mm3,mm5);
  // mm4.w= mm6.w+mm4.w+mm2.w+mm2.w
  // mm5.w= mm7.w+mm5.w+mm3.w+mm3.w

  movq_r2r      (mm3,mm1);
  pshufw_r2r    (mm1,mm1,0xff);
  pand_m2r      (mmx_000f,mm1);
  // mm1= 00 00 00 j7

  movq_m2r      (*In,mm2); // mm2= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r      (mm2,mm3);
  punpcklbw_r2r (mm0,mm2);
  punpckhbw_r2r (mm0,mm3);

  movq_r2r      (mm2,mm6); // mm6= ** ** ** j0
  pshufw_r2r    (mm6,mm6,0x0);
  pand_m2r      (mmx_f000,mm6);
  // mm6= j0 00 00 00
  paddw_r2r     (mm6,mm5);

  paddw_r2r     (mm7,mm4);
  paddw_r2r     (mm7,mm5);
  psrlw_i2r     (2,mm4);
  psrlw_i2r     (2,mm5);
  packuswb_r2r  (mm5,mm4);
  #ifdef __WITH_MOVNTQ__
  movntq_r2m    (mm4,*Out);
  #else
  movq_r2m    (mm4,*Out);
  #endif
}

static inline void Mmmx121xSmooth(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {
  for (int yy= y; yy; yy--) {
    const unsigned char* PteIn= In+xStrideIn*(yy-1);
    Mmmx121xSmooth_1(PteIn);
    PteIn+= 8;
    unsigned char* PteOut= Out+xStrideOut*(yy-1);
    for (int xx= x/8; xx; xx--) {
      Mmmx121xSmooth_2(PteIn, PteOut);
      PteIn+= 8;
      PteOut+= 8;
    };
  };
}

static inline void Mmmx121xSmooth_pavgb1(const unsigned char* In,unsigned char *Out,int xStrideIn)
{
  static mmx_t mmx0=      {0x0000000000000000ull};
  static mmx_t mmx255=    {0xFFFFFFFFFFFFFFFFull};
  static mmx_t mmx128=    {0x7F7F7F7F7F7F7F7Full};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};

  movq_m2r(mmx0,mm6);

  movq_m2r(*In,mm0);             // mm0= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r(mm0,mm5);             // mm5= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r(mm0,mm7);             // mm7= i7 i6 i5 i4 i3 i2 i1 i0
  
  //Decalage de mm0 vers la droite
  movq_r2r(mm0,mm1);
  punpckhbw_r2r(mm6,mm0);        // mm0= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm1);        // mm1= 00 i3 00 i2 00 i1 00 i0
  pshufw_r2r(mm0,mm0,0x93);      // mm0= 00 i6 00 i5 00 i4 00 i7
  pshufw_r2r(mm1,mm1,0x93);      // mm1= 00 i2 00 i1 00 i0 00 i3
  movq_r2r(mm1,mm2);
  pand_m2r(mmx_000F,mm2);        // mm2= 00 00 00 00 00 00 00 i3
  pand_m2r(mmx_FFF0,mm0);        // mm0= 00 i6 00 i5 00 i4 00 00
  paddusw_r2r(mm2,mm0);          // mm0= 00 i6 00 i5 00 i4 00 i3  Ok pour mm0
  psubusw_r2r(mm2,mm1);          // mm1= 00 i2 00 i1 00 i0 00 00
  movq_m2r(*(In-8),mm2);         // mm2= j7 j6 j5 j4 j3 j2 j1 j0
  punpckhbw_r2r(mm6,mm2);        // mm2= 00 j7 00 j6 00 j5 00 j4
  pshufw_r2r(mm2,mm2,0x93);      // mm2= 00 j6 00 j5 00 j4 00 j7
  pand_m2r(mmx_000F,mm2);        // mm2= 00 00 00 00 00 00 00 j7
  paddusw_r2r(mm2,mm1);          // mm1= 00 i2 00 i1 00 i0 00 j7  Ok pour mm1
  packuswb_r2r(mm0,mm1);         // mm1= i6 i5 i4 i3 i2 i1 i0 j7

  //decalage de mm5 vers la gauche
  //mm1 ne peut plus etre utilise, les autres registres sont libres
  movq_r2r(mm5,mm0);
  punpckhbw_r2r(mm6,mm5);        // mm5= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm0);        // mm0= 00 i3 00 i2 00 i1 00 i0
  pshufw_r2r(mm5,mm5,0x39);      // mm5= 00 i4 00 i7 00 i6 00 i5
  pshufw_r2r(mm0,mm0,0x39);      // mm0= 00 i0 00 i3 00 i2 00 i1
  movq_r2r(mm5,mm4);             // mm4= 00 i4 00 i7 00 i6 00 i5
  pand_m2r(mmx_F000,mm4);        // mm4= 00 i4 00 00 00 00 00 00
  pand_m2r(mmx_0FFF,mm0);        // mm0= 00 00 00 i3 00 i2 00 i1
  paddusw_r2r(mm4,mm0);          // mm0= 00 i4 00 i3 00 i2 00 i1  Ok pour mm0
  psubusw_r2r(mm4,mm5);          // mm5= 00 00 00 i7 00 i6 00 i5
  movq_m2r(*(In+8),mm2);         // mm2= j7 j6 j5 j4 j3 j2 j1 j0
  punpcklbw_r2r(mm6,mm2);        // mm2= 00 j3 00 j2 00 j1 00 j0
  pshufw_r2r(mm2,mm2,0x39);      // mm2= 00 j0 00 j3 00 j2 00 j1
  pand_m2r(mmx_F000,mm2);        // mm2= 00 j0 00 00 00 00 00 00
  paddusw_r2r(mm2,mm5);          // mm5= 00 j0 00 i7 00 i6 00 i5  Ok pour mm5
  packuswb_r2r(mm5,mm0);         // mm0= j0 i7 i6 i5 i4 i3 i2 i1

  //A ce stade on doit calculer 1/4*(a+2*b+c) avec mm1=a, mm0=c, mm7=b
  pavgb_r2r(mm0,mm1);   //mm1=(a+c)/2
  pavgb_r2r(mm1,mm7);   //mm7=( (a+c)/2 +b )/2
  
  movq_r2m(mm7,*Out);  
}

static inline void Mmmx121xSmooth_pavgb(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut)
{
  for (int yy= 0; yy<y; yy++)
  {
    const unsigned char* PteIn= In+yy*xStrideIn;
    unsigned char* PteOut= Out+yy*xStrideOut;
    //filtrage du premier octet
    PteOut[0]=PteIn[0];
    for (int k=1;k<8;k++) PteOut[k]=(PteIn[k-1]+PteIn[k+1]+2*PteIn[k])/4;
    PteIn+=8;
    PteOut+=8;
    for (int xx= 8; xx<x-8; xx+=8)
    {
      Mmmx121xSmooth_pavgb1(PteIn, PteOut,xStrideIn);
      PteIn+=  8;
      PteOut+= 8;
    };
    //filtrage du dernier octet
    for (int k=0;k<7;k++) PteOut[k]=(PteIn[k-1]+PteIn[k+1]+2*PteIn[k])/4;
    PteOut[7]=PteIn[7];
    
  };
}
//SSE2
static inline void Msse2_121xSmooth_pavgb1(const unsigned char* In,unsigned char *Out,int xStrideIn)
{
  static sse_t sse0=      {0x00000000000000000000000000000000ull};

  movdqa_m2r(sse0,xmm6);

  movdqa_m2r(*In,xmm0);             // xmm0= i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0
  movdqa_r2r(xmm0,xmm1);
  movdqa_r2r(xmm0,xmm5);
  
  
  //Decalage de xmm1 vers la gauche
  pslldq_i2r(1,xmm1);               // xmm1= i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0 00
	movdqa_m2r(*(In-16),xmm2);        // xmm2= j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0
	psrldq_i2r(15,xmm2);							// xmm2= 00  00  00  00  00  00  00 00 00 00 00 00 00 00 00 j15
	paddb_r2r(xmm2,xmm1);							// xmm1= i14 i13 i12 i11 10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0 j15

  //decalage de xmm5 vers la droite
  psrldq_i2r(1,xmm5);               // xmm5= 00  i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1
	movdqa_m2r(*(In+16),xmm2);        // xmm2= j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0
	pslldq_i2r(15,xmm2);							// xmm2= j0 00  00  00  00  00  00  00 00 00 00 00 00 00 00 00
	paddb_r2r(xmm2,xmm5);							// xmm5= j0 i15 i14 i13 i12 i11 10 i9 i8 i7 i6 i5 i4 i3 i2 i1

  //A ce stade on doit calculer 1/4*(a+2*b+c) avec xmm1=a, xmm0=b, xmm5=c
  pavgb_r2r(xmm5,xmm1);   //xmm1=(a+c)/2
  pavgb_r2r(xmm1,xmm0);   //xmm7=( (a+c)/2 +b )/2

  movdqa_r2m(xmm0,*Out);
}
static inline void Msse2_121xSmooth_pavgb(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut)
{
  for (int yy= 0; yy<y; yy++)
  {
    const unsigned char* PteIn= In+yy*xStrideIn;
    unsigned char* PteOut= Out+yy*xStrideOut;
    //filtrage du premier paquet de 16 octets
    PteOut[0]=PteIn[0];
    for (int k=1;k<16;k++) PteOut[k]=(PteIn[k-1]+PteIn[k+1]+2*PteIn[k])/4;
    PteIn+=16;
    PteOut+=16;
    for (int xx= 16; xx<x-16; xx+=16)
    {
      Msse2_121xSmooth_pavgb1(PteIn, PteOut,xStrideIn);
      PteIn+=  16;
      PteOut+= 16;
    };
    //filtrage du dernier paquet de 16 octets
    for (int k=0;k<15;k++) PteOut[k]=(PteIn[k-1]+PteIn[k+1]+2*PteIn[k])/4;
    PteOut[15]=PteIn[15];

  };
}

#else
static inline void Mmmx121ySmooth_pavgb(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {}
static inline void Msse2_121ySmooth_pavgb(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {}
static inline void Mmmx121xSmooth(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {}
static inline void Msse2_121xSmooth_pavgb(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {}
#endif

void MCharImage::Smooth121(int IsHorizontal0Vertical1, const MCharImage&Data) {
  SetSize(Data.x__, Data.y__);
  if (IsHorizontal0Vertical1==1) {
    if (MWithMmxSseUse()==2) {
      Msse2_121ySmooth_pavgb(Data.yuv, x__, x__, y__, yuv, x__);
      /*//verif
        MCharImage Verif(x__,y__);
				Mmmx121ySmooth_pavgb(Data.yuv, x__, x__, y__, Verif.yuv, x__);
				Memms(); // now FPU can be used
        //cout << "verif" << endl;
				for (int i=0;i<x__;i++)
				{
					for (int j=0;j<y__-1;j++)
					{
						if (yuv[i+j*x__]!=Verif.yuv[i+j*x__]) cout << int(yuv[i+j*x__]) << " = " << int(Verif.yuv[i+j*x__]) << " (" << i << "," << j << ")" << endl;
					}
				}
				//cout << "finverif" << endl;*/
				      
    }
    else
    {
			if (MWithMmxSseUse()==1)
			{
				Mmmx121ySmooth_pavgb(Data.yuv, x__, x__, y__, yuv, x__);
				Memms(); // now FPU can be used
			}
			else
			{
				for (int xx= 0; xx< x__; xx++)
				{
					unsigned char *PteIn= Data.yuv+xx, *PteOut= yuv+xx+x__;
					int a= PteIn[0], b= PteIn[x__];
					PteIn+= (x__+x__);
					for (int yy= y__-2; yy; yy--) {
						int c= *PteIn;
						*PteOut= (unsigned char)((a+c+b+b+2)>>2);
						PteIn+=  x__;
						PteOut+= x__;
						a= b; b= c;
					};
				};
			}
		}
	} //Fin lissage y
  else
  {
		//Si lissage en x
    if (MWithMmxSseUse()==2)
    {
			Msse2_121xSmooth_pavgb(Data.yuv, x__, x__, y__, yuv, x__);
		}
		else
		{
			if (MWithMmxSseUse()==1)
			{
				Mmmx121xSmooth_pavgb(Data.yuv, x__, x__, y__, yuv, x__);
				Memms(); // now FPU can be used
			}
			else
			{
				for (int yy= 0; yy< y__; yy++)
				{
					unsigned char *PteIn= Data.yuv+yy*x__+1, *PteOut= yuv+yy*x__;
					int a= 0, b= PteIn[-1];
					for (int xx= 0; xx< x__; xx++) {
						int c= PteIn[xx];
						PteOut[xx]= (unsigned char)((a+c+b+b+2)>>2);
						a= b; b= c;
					};
				};      

			}  
		};
	}
}


// ****************************** Laplacian1_21 *******************************

#ifdef ARCH_X86
static inline void Mmmx121yLaplacian_1(const unsigned char* In, int xStride) {
  static mmx_t mmx_128= {0x0080008000800080ull};
  movq_m2r      (mmx_128,mm7);
  pxor_r2r      (mm0,mm0);  // mm0= 0
  movq_m2r      (*In,mm1);  // mm1= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r      (mm1,mm2);
  punpcklbw_r2r (mm0,mm1);  // mm1= 00 i3 00 i2 00 i1 00 i0
  punpckhbw_r2r (mm0,mm2);  // mm2= 00 i7 00 i6 00 i5 00 i4

  movq_m2r      (*(In+xStride),mm3);
  movq_r2r      (mm3,mm4);
  punpcklbw_r2r (mm0,mm3);  // mm3= 00 j3 00 j2 00 j1 00 j0
  punpckhbw_r2r (mm0,mm4);  // mm4= 00 j7 00 j6 00 j5 00 j4
}
static inline void Mmmx121yLaplacian_2(const unsigned char* xLaplacianIn,
				 const unsigned char* In, unsigned char* Out) {
  // add the allready calculated x-laplacian in mm1 and mm2
  movq_m2r      (*xLaplacianIn,mm5);
  movq_r2r      (mm5,mm6);
  punpcklbw_r2r (mm0,mm5);  // mm5= 00 lx3+128 00 lx2+128 00 lx1+128 00 lx0+128
  punpckhbw_r2r (mm0,mm6);  // mm6= 00 lx7+128 00 lx6+128 00 lx5+128 00 lx4+128
  psubw_r2r     (mm7,mm5);  // mm5= lx3 lx2 lx1 lx0
  psubw_r2r     (mm7,mm6);  // mm6= lx7 lx6 lx5 lx4    ( -128 < lxi < 128)
  paddw_r2r     (mm5,mm1);
  paddw_r2r     (mm6,mm2);

  // calculate and add the y-laplacian in mm1 and mm2 to obtain xy-laplacian
  movq_m2r      (*In,mm5);  // (*(Out+xStride),mm5);
  movq_r2r      (mm5,mm6);
  punpcklbw_r2r (mm0,mm5);  // mm5= 00 k3 00 k2 00 k1 00 k0
  punpckhbw_r2r (mm0,mm6);  // mm6= 00 k7 00 k6 00 k5 00 k4
  paddw_r2r     (mm5,mm1);  // mm1= .. ki+ii+xlaplacian(xi) .. i= 3..0
  paddw_r2r     (mm6,mm2);  // mm2= .. ki+ii+xlaplacian(xi) .. i= 7..4

  psubw_r2r     (mm3,mm1);
  psubw_r2r     (mm4,mm2);
  psubw_r2r     (mm3,mm1);  // mm1= xlaplacian(xi)+ylaplacian(xi) .. i= 3..0
  psubw_r2r     (mm4,mm2);  // mm2= xlaplacian(xi)+ylaplacian(xi) .. i= 7..4
  // WARNING: -256<xlaplacian(xi)+ylaplacian(xi)<256 => xy-laplacian is satured
  // psraw_i2r     (1,mm1);   // is not mm1= mm1/2 if mm1< 0 ( -1/2= -1 or 0)
  // psraw_i2r     (1,mm2);
  paddw_r2r     (mm7,mm1);
  paddw_r2r     (mm7,mm2);
  packuswb_r2r  (mm2,mm1); // yes, xy-laplacian will be satured like this:
  // mm1= max(0, min(255, xlaplacian(xi)+ylaplacian(xi)+128)), i= 7..0
  #ifdef __WITH_MOVNTQ__
  movntq_r2m    (mm1,*Out);
  #else
  movq_r2m    (mm1,*Out);
  #endif
  movq_r2r      (mm3,mm1);
  movq_r2r      (mm4,mm2);
  movq_r2r      (mm5,mm3);
  movq_r2r      (mm6,mm4);
}
static inline void Mmmx121yLaplacian(const unsigned char* xLaplacianIn,
			  const unsigned char* In, int xStrideIn, int x, int y,
				     unsigned char* Out, int xStrideOut) {
  for (int xx= x/8; xx; xx--) {
    const unsigned char* PteIn= In+8*(xx-1);
    Mmmx121yLaplacian_1(PteIn, xStrideIn);
    PteIn+= (xStrideIn+xStrideIn);
    const unsigned char* PtexLaplacianIn= xLaplacianIn+8*(xx-1)+xStrideIn;
    unsigned char* PteOut= Out+8*(xx-1)+xStrideOut;
    for (int yy= y-2; yy; yy--) {
      Mmmx121yLaplacian_2(PtexLaplacianIn, PteIn, PteOut);
      PtexLaplacianIn+= xStrideIn;
      PteIn+=           xStrideIn;
      PteOut+=          xStrideOut;
    };
  };
}

//pour le laplacien en x, s inspirer de Msse121xLaplacian, sinon Mmmx121xSmooth
static inline void Mmmx121xLaplacian_1(const unsigned char* In) {
  static mmx_t mmx_128= {0x0080008000800080ull};
  movq_m2r      (mmx_128,mm0);
  pxor_r2r      (mm2,mm2); // mm2= 0
  pxor_r2r      (mm3,mm3); // mm3= 0
  movq_m2r      (*In,mm5); // mm5= j7 j6 j5 j4 j3 j2 j1 j0
}
static inline void Mmmx121xLaplacian_2(const unsigned char* In,
				       unsigned char* Out) {
  // mm0= 80 80 80 80,         mm2= 0,              mm1 and mm4 are unused
  // mm3= i7 i6 i5 i4 i3 i2 i1 i0,  mm5= j7 j6 j5 j4 j3 j2 j1 j0
  psrlq_i2r     (56,mm3);   // mm3= 0 0 0 0 0 0 0 i7

  movq_r2r      (mm5,mm4);
  psllq_i2r     (8,mm4);    // mm4= j6 j5 j4 j3 j2 j1 j0 0
  por_r2r       (mm4,mm3);  // mm3= j6 j5 j4 j3 j2 j1 j0 i7
  movq_r2r      (mm3,mm4);
  punpcklbw_r2r (mm2,mm3);  // mm3= j2 j1 j0 i7
  punpckhbw_r2r (mm2,mm4);  // mm4= j6 j5 j4 j3

  movq_r2r      (mm5,mm6);
  punpcklbw_r2r (mm2,mm6);  // mm6= j3 j2 j1 j0
  psubw_r2r     (mm6,mm3);
  psubw_r2r     (mm6,mm3);  // mm3= j2-2*j3 j1-2*j2 j0-2*j1 i7-2*j0

  movq_r2r      (mm5,mm6);
  punpckhbw_r2r (mm2,mm6);  // mm6= j7 j6 j5 j4
  psubw_r2r     (mm6,mm4);
  psubw_r2r     (mm6,mm4);  // mm4= j6-2*j7 j5-2*j6 j4-2*j5 j3-2*j4

  movq_m2r      (*In,mm1);  // mm1= k7 k6 k5 k4 k3 k2 k1 k0

  movq_r2r      (mm1,mm7);
  psllq_i2r     (56,mm7);   // mm7= k0 0 0 0 0 0 0 0
  movq_r2r      (mm5,mm6);
  psrlq_i2r     (8,mm6);    // mm6=  0 j7 j6 j5 j4 j3 j2 j1
  por_r2r       (mm7,mm6);  // mm6= k0 j7 j6 j5 j4 j3 j2 j1

  movq_r2r      (mm6,mm7);
  punpcklbw_r2r (mm2,mm7);  // mm7= j4 j3 j2 j1
  paddw_r2r     (mm7,mm3);  // mm3= j2-2*j3+j4 j1-2*j2+j3 j0-2*j1+j2 i7-2*j0+j1

  punpckhbw_r2r (mm2,mm6);  // mm6= k0 j7 j6 j5
  paddw_r2r     (mm6,mm4);  // mm4= j6-2*j7+k0 j5-2*j6+j7 j4-2*j5+j6 j3-2*j4+j5

  paddw_r2r     (mm0,mm3);
  paddw_r2r     (mm0,mm4);
  packuswb_r2r  (mm4,mm3);
  // mm3= j6-2*j7+k0+128 .. j3-2*j4+j5+128 .. i7-2*j0+j1+128
  #ifdef __WITH_MOVNTQ__
  movntq_r2m    (mm3,*Out);
  #else
  movq_r2m    (mm3,*Out);
  #endif
  
  movq_r2r      (mm5,mm3);
  movq_r2r      (mm1,mm5);
}

static inline void Mmmx121xLaplacian(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {
  for (int yy= y; yy; yy--) {
    const unsigned char* PteIn= In+xStrideIn*(yy-1);
    Mmmx121xLaplacian_1(PteIn);
    PteIn+= 8;
    unsigned char* PteOut= Out+xStrideOut*(yy-1);
    for (int xx= x/8; xx; xx--) {
      Mmmx121xLaplacian_2(PteIn, PteOut);
      PteIn+= 8;
      PteOut+= 8;
    };
  };
}
#else
static inline void Mmmx121yLaplacian(const unsigned char* xLaplacianIn,
			  const unsigned char* In, int xStrideIn, int x, int y,
				     unsigned char* Out, int xStrideOut) {}
static inline void Mmmx121xLaplacian(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {}
#endif

void MCharImage::Laplacian1_21_128(const MCharImage& Data) {
  assert(&Data!= this);
  SetSize(Data.x__, Data.y__);
  if (MWithMmxSseUse()) {
    Mmmx121xLaplacian(Data.yuv, x__, x__, y__, yuv, x__);
    Mmmx121yLaplacian(yuv, Data.yuv, x__, x__, y__, yuv, x__);
    Memms(); // now FPU can be used
  } else
    for (int i= x__+1, iMax= x__*(y__-1); i< iMax; i++) {
      unsigned char *PteIn= Data.yuv+i;
      yuv[i]= (unsigned char) Max(0, Min(255,
		  (128+PteIn[-1]+PteIn[1]+PteIn[x__]+PteIn[-x__]-4*PteIn[0])));
    };
}

// ************************* AbsAndMultiplyLaplacian **************************

#ifdef ARCH_X86
static inline void MmmxAbsAndMultiplyLaplacian_1(int* Coef) {
  static mmx_t mmx_128= {0x0080008000800080ull};
  pxor_r2r      (mm0,mm0);
  movq_m2r      (mmx_128,mm1);
  movd_m2r      (*Coef,mm6);
  pshufw_r2r    (mm6,mm6,0); // mm6= coef coef coef
}
static inline void MmmxAbsAndMultiplyLaplacian_2(unsigned char* Out) {
  movq_m2r      (*Out,mm2);
  movq_r2r      (mm2,mm3);
  punpcklbw_r2r (mm0,mm2);
  punpckhbw_r2r (mm0,mm3);
  psubw_r2r     (mm1,mm2); // mm2= i3-128  i2-128  i1-128  i0-128
  psubw_r2r     (mm1,mm3); // mm3= i7-128  i6-128  i5-128  i4-128
  pxor_r2r      (mm4,mm4);
  pxor_r2r      (mm5,mm5);
  psubw_r2r     (mm2,mm4);
  psubw_r2r     (mm3,mm5);
  pmaxsw_r2r    (mm4,mm2);
  pmaxsw_r2r    (mm5,mm3);
  pmullw_r2r    (mm6,mm2);
  pmullw_r2r    (mm6,mm3);
  // mm2= mm6*|i3-128| mm6*|i2-128| mm6*|i1-128| mm6*|i0-128|
  // mm3= mm6*|i7-128| mm6*|i6-128| mm6*|i5-128| mm6*|i4-128|

  packuswb_r2r  (mm3,mm2); // mm7= .. max(0, min(255, mm6*|ii-128|) ..
  #ifdef __WITH_MOVNTQ__
  movntq_r2m    (mm2,*Out);
  #else
  movq_r2m    (mm2,*Out);
  #endif
}
#else
static inline void MmmxAbsAndMultiplyLaplacian_1() {}
static inline void MmmxAbsAndMultiplyLaplacian_2(unsigned char* Out) {}
#endif

void MCharImage::AbsAndMultiplyLaplacian_128(int Coef) {
  if (MWithMmxSseUse()) {
    unsigned char* Out= yuv;
    MmmxAbsAndMultiplyLaplacian_1(&Coef);
    for (int i= x__*y__/8; i; i--) {
      MmmxAbsAndMultiplyLaplacian_2(Out);
      Out+= 8;
    };
    Memms();
  } else
    for (int i= x__*y__-1; i; i--)
      yuv[i-1]= (unsigned char)Max(0, Min(255, Abs(Coef*(yuv[i-1]-128))));
}

// *************************** Zncc 5x5 using pmaddwd *************************

#ifdef ARCH_X86
static inline void MmmxZncc5x5_1a() {
  static mmx_t mmx_000fffff = {0x000000ffffffffffull},
                  mmx_8888={0x0080008000800080ull}, mmx_0008={0x0000000000000080ull};
  movq_m2r    (mmx_000fffff,mm0); // mm0= 00 00 00 ff ff ff ff ff
  movq_m2r    (mmx_8888,mm1);     // mm1= 80 80 80 80
  movq_m2r    (mmx_0008,mm2);     // mm2= 0  0  0  80
  pxor_r2r    (mm3,mm3);          // mm3= 0
  xorps_r2r   (xmm7,xmm7);
}
static inline void MmmxZncc5x5_1b() {
  xorps_r2r   (xmm1,xmm1);
  xorps_r2r   (xmm2,xmm2);
  xorps_r2r   (xmm3,xmm3);
  xorps_r2r   (xmm4,xmm4);
  xorps_r2r   (xmm5,xmm5);
}
static inline void MmmxZncc5x5_2a(const unsigned char* In0,
				  const unsigned char* In1) {
  xorps_r2r     (xmm0,xmm0);
  movq_m2r      (*In0,mm4);
  movq_m2r      (*In1,mm5);
  pand_r2r      (mm0,mm4); // mm4= 0 0 0 i4+128 i3+128 i2+128 i1+128 i0+128
  pand_r2r      (mm0,mm5); // mm5= 0 0 0 j4+128 j3+128 j2+128 j1+128 j0+128

  // horizontal square sum of mm4
  movq_r2r      (mm4,mm6);
  punpcklbw_r2r (mm3,mm6);
  psubw_r2r     (mm1,mm6); // mm6= i3 i2 i1 i0
  pmaddwd_r2r   (mm6,mm6); // mm6= i3*i3+i2*i2 i1*i1+i0*i0
  pshufw_r2r    (mm6,mm7,0x4e);
  paddd_r2r     (mm6,mm7); // mm7= 2* i3*i3+i2*i2+i1*i1+i0*i0

  movq_r2r      (mm4,mm6);
  punpckhbw_r2r (mm3,mm6);
  psubw_r2r     (mm2,mm6); // mm6= 0 0 0 i4
  pmaddwd_r2r   (mm6,mm6); // mm6= 0+0 0+i4*i4
  paddd_r2r     (mm6,mm7); // mm7= * i4*i4+i3*i3+i2*i2+i1*i1+i0*i0
  cvtpi2ps_r2r  (mm7,xmm0);
  shufps_r2r    (xmm0,xmm0,0x3f); // 3f= 00 11 11 11
  // xmm0= i4*i4+i3*i3+i2*i2+i1*i1+i0*i0 0 0 0

  // horizontal square sum of mm5
  movq_r2r      (mm5,mm6);
  punpcklbw_r2r (mm3,mm6);
  psubw_r2r     (mm1,mm6); // mm6= j3 j2 j1 j0
  pmaddwd_r2r   (mm6,mm6); // mm6= j3*j3+j2*j2 j1*j1+j0*j0
  pshufw_r2r    (mm6,mm7,0x4e);
  paddd_r2r     (mm6,mm7); // mm7= 2* j3*j3+j2*j2+j1*j1+j0*j0

  movq_r2r      (mm5,mm6);
  punpckhbw_r2r (mm3,mm6);
  psubw_r2r     (mm2,mm6); // mm6= 0 0 0 j4
  pmaddwd_r2r   (mm6,mm6); // mm6= 0+0 0+j4*j4
  paddd_r2r     (mm6,mm7); // mm7= * j4*j4+j3*j3+j2*j2+j1*j1+j0*j0
  cvtpi2ps_r2r  (mm7,xmm0);
  shufps_r2r    (xmm0,xmm0,0xca); // ca= 11 00 10 10
  // xmm0= i4*i4+i3*i3+i2*i2+i1*i1+i0*i0 j4*j4+j3*j3+j2*j2+j1*j1+j0*j0 0 0

  // horizontal product sum of mm4 and mm5
  movq_r2r      (mm4,mm6);
  movq_r2r      (mm5,mm7);
  punpcklbw_r2r (mm3,mm6);
  punpcklbw_r2r (mm3,mm7);
  psubw_r2r     (mm1,mm6); // mm6= i3 i2 i1 i0
  psubw_r2r     (mm1,mm7); // mm7= j3 j2 j1 j0
  pmaddwd_r2r   (mm6,mm7); // mm6= i3*j3+i2*j2 i1*j1+i0*j0
  pshufw_r2r    (mm7,mm6,0x4e);
  paddd_r2r     (mm6,mm7); // mm7= 2* i3*j3+i2*j2+i1*j1+i0*j0

  punpckhbw_r2r (mm3,mm4);
  punpckhbw_r2r (mm3,mm5);
  psubw_r2r     (mm2,mm4); // mm4= 0 0 0 i4
  psubw_r2r     (mm2,mm5); // mm5= 0 0 0 j4
  pmaddwd_r2r   (mm4,mm5); // mm5= 0+0 0+i4*j4
  // pshufw_r2r    (mm5,mm4,0x4e); // usefull only for 7x7 zncc
  // paddd_r2r     (mm4,mm5); // mm5= 2* 0+0+0+i4*j4

  paddd_r2r     (mm5,mm7);
  cvtpi2ps_r2r  (mm7,xmm0);
  // xmm0= i4*i4+i3*i3+i2*i2+i1*i1+i0*i0 j4*j4+j3*j3+j2*j2+j1*j1+j0*j0
  //       *                             i4*j4+i3*j3+i2*j2+i1*j1+i0*j0, ( *> 0)

  // vertical sum of horizontal square sum and horizontal product sum
  addps_r2r   (xmm0,xmm7);
}
static inline void MmmxZncc5x5_2b() {
  subps_r2r   (xmm5,xmm7);

  movaps_r2r  (xmm4,xmm5);
  movaps_r2r  (xmm3,xmm4);
  movaps_r2r  (xmm2,xmm3);
  movaps_r2r  (xmm1,xmm2);
  movaps_r2r  (xmm0,xmm1);
}

static inline void MmmxZncc5x5_3(int* Out) {
  static sse_t sse_65535 = { 65535, 65535, 65535, 65535 };
  // output (now xmm0 is not yet used)
  movaps_r2r  (xmm7,xmm6);
  shufps_r2r  (xmm6,xmm6,0xee); // xmm6= 2*( iSumQuare jSumQquare )
  xorps_r2r   (xmm0,xmm0);
  cmpneqps_r2r (xmm6,xmm0); // xmm0= 2*( (iSumQuare? -1:0) (jSumQuare? -1:0) )
  rsqrtps_r2r (xmm6,xmm6);
  andps_r2r   (xmm0,xmm6);  // xmm6= 2*( (iSumSquare)? 1/sqrt(iSumQuare):0 )

  movss_r2r   (xmm7,xmm0);
  mulss_r2r   (xmm6,xmm0);      // xmm0= ijSumProduct/sqrt(jSumQuare)
  shufps_r2r  (xmm6,xmm6,0x01);
  mulss_r2r   (xmm6,xmm0);
  // xmm0= * * * * ijSumProduct/(sqrt(iSumQuare)*sqrt(jSumQuare))

  mulss_m2r    (sse_65535,xmm0);
  cvtps2pi_r2r (xmm0,mm4);
  movd_r2m     (mm4,*Out);
}

#else
static inline void MmmxZncc5x5_1a() {}
static inline void MmmxZncc5x5_1b() {}
static inline void MmmxZncc5x5_2(const unsigned char* In0,
				 const unsigned char* In1);
static inline void MmmxZncc5x5_3(int* Out) {}
#endif

float MCharImage::Zncc5x5(int x0,int y0, int x1,int y1,const MCharImage& Data) const
{
  x0-= 2; y0-= 2;
  x1-= 2; y1-= 2;
  if (WithMmxSseUse)
  {
    MmmxZncc5x5_1a();
    const unsigned char *In0= yuv+y0*x__+x0, *In1= Data.yuv+y1*Data.x__+x1;
    for (int i= 5; i; i--) {
      MmmxZncc5x5_2a(In0, In1);
      In0+= x__; In1+= Data.x__;
    };
    int Result;
    MmmxZncc5x5_3(&Result);
    return Result;
  }
  else
  {
    int SumSquare0= 0, SumSquare1= 0, SumProduct= 0;
    for (int yy= 5; yy; yy--)
    {
      const unsigned char *In0= yuv+(y0+yy-1)*x__+x0-1;
			const unsigned char	*In1= Data.yuv+(y1+yy-1)*Data.x__+x1-1;
      for (int xx= 5; xx; xx--)
      {
				int L0= In0[xx]-128, L1= In1[xx]-128;
				SumSquare0+= L0*L0; SumSquare1+= L1*L1; SumProduct+= L0*L1;
      };
    };
    return ((SumProduct)? int((65535.*float(SumProduct))/
			      sqrt(float(SumSquare0)*float(SumSquare1))): 0);
  };
	    
}

static inline void MZncc5x5(const unsigned char* In0, const unsigned char* In1,
			  int& SumSquare0, int& SumSquare1, int& SumProduct) {
  SumSquare0= 0; SumSquare1= 0; SumProduct= 0;
  for (int xx= 5; xx; xx--) {
    int L0= In0[xx]-128, L1= In1[xx]-128;
    SumSquare0+= L0*L0; SumSquare1+= L1*L1; SumProduct+= L0*L1;
  };
}

void MCharImage::Zncc5x5(int x0,int y0, int x1,int y1, int dy,
			 const MCharImage& Data, int* TableOfdyScore) const {
  x0-= 2; y0-= 2;
  x1-= 2; y1-= 2;
  if (WithMmxSseUse) {
    MmmxZncc5x5_1a();
    MmmxZncc5x5_1b();
    const unsigned char *In0= yuv+y0*x__+x0, *In1= Data.yuv+y1*Data.x__+x1;
    for (int j= 4; j; j--) {
      MmmxZncc5x5_2a(In0, In1);
      MmmxZncc5x5_2b();
      In0+= x__; In1+= Data.x__;
    };
    int* Out= TableOfdyScore;
    for (int i= dy; i; i--) {
      MmmxZncc5x5_2a(In0, In1);
      MmmxZncc5x5_2b();
      MmmxZncc5x5_3(Out);
      In0+= x__; In1+= Data.x__; Out++;
    };
  } else {
    int SumSquare0= 0, SumSquare1= 0, SumProduct= 0, i,
      SumSquare0Table[7], SumSquare1Table[7], SumProductTable[7];
    const unsigned char *In0= yuv+y0*x__+x0-1, *In1= Data.yuv+y1*Data.x__+x1-1;
    for (i= 0; i< 4; i++) {
      MZncc5x5(In0, In1, SumSquare0Table[i], SumSquare1Table[i],
	       SumProductTable[i]);
      SumSquare0+= SumSquare0Table[i];
      SumSquare1+= SumSquare1Table[i];
      SumProduct+= SumProductTable[i];
      In0+= x__; In1+= Data.x__;
    };
    SumSquare0Table[5]= 0; SumSquare1Table[5]= 0; SumProductTable[5]= 0;
    for (int i= 0; i< dy; i++) {
      int iAdd= (i+4)%6, iSub= (i+5)%6;
      MZncc5x5(In0, In1, SumSquare0Table[iAdd], SumSquare1Table[iAdd],
	       SumProductTable[iAdd]);
      SumSquare0+= SumSquare0Table[iAdd]-SumSquare0Table[iSub];
      SumSquare1+= SumSquare1Table[iAdd]-SumSquare1Table[iSub];
      SumProduct+= SumProductTable[iAdd]-SumProductTable[iSub];
      In0+= x__; In1+= Data.x__;
      TableOfdyScore[i]= ((SumProduct)? int((65535.*float(SumProduct))/
				sqrt(float(SumSquare0)*float(SumSquare1))): 0);
    };
  };
}



// ********************************* Mask 5x5  ********************************

#ifdef ARCH_X86
static inline void MmmxMask5x5_1_0() {
 static mmx_t mmx_000fffff={0x000000ffffffffffull}; movq_m2r (mmx_000fffff,mm0); }
static inline void MmmxMask5x5_1_1() {
 static mmx_t mmx_00fffff0={0x0000ffffffffff00ull}; movq_m2r (mmx_00fffff0,mm0); }
static inline void MmmxMask5x5_1_2() {
 static mmx_t mmx_0fffff00={0x00ffffffffff0000ull}; movq_m2r (mmx_0fffff00,mm0); }
static inline void MmmxMask5x5_1_3() {
 static mmx_t mmx_fffff000={0xffffffffff000000ull}; movq_m2r (mmx_fffff000,mm0); }

static inline void MmmxMask5x5_2(const unsigned char* In, unsigned char* Out) {
  movq_m2r   (*In,mm1);
  pand_r2r   (mm0,mm1);
  movq_m2r   (*Out,mm2);
  por_r2r    (mm1,mm2);
  #ifdef __WITH_MOVNTQ__
  movntq_r2m (mm2,*Out);
  #else
  movq_r2m (mm2,*Out);
  #endif
}
static inline void MmmxMask5x5(const unsigned char* In, int xStride,
			       unsigned char* Out, int dx) {
  switch (dx) { case 0: MmmxMask5x5_1_0(); break;
                case 1: MmmxMask5x5_1_1(); break;
                case 2: MmmxMask5x5_1_2(); break;
                case 3: MmmxMask5x5_1_3(); break; };
  MmmxMask5x5_2(In, Out); In+= xStride; Out+= xStride;
  MmmxMask5x5_2(In, Out); In+= xStride; Out+= xStride;
  MmmxMask5x5_2(In, Out); In+= xStride; Out+= xStride;
  MmmxMask5x5_2(In, Out); In+= xStride; Out+= xStride;
  MmmxMask5x5_2(In, Out); In+= xStride; Out+= xStride;
}
#else
static inline void MmmxMask5x5(const unsigned char* In, int xStride,
			       unsigned char* Out, int dx) {}
#endif

inline void MCharImage::Mask5x5(int xx, int yy, const MCharImage& Data) {
  xx-= 2;
  yy-= 2;
  if (WithMmxSseUse) {
    int dx= (xx&3);
    xx-= dx;
    MmmxMask5x5(Data.yuv+yy*x__+xx, x__, yuv+yy*x__+xx, dx);
  } else
    for (int dy= 0; dy< 5; dy++) {
      const unsigned char* In=  Data.yuv+(yy+dy)*x__+xx;
      unsigned char*       Out= yuv+(yy+dy)*x__+xx;
      for (int dx= 0; dx< 5; dx++)
	Out[dx]= In[dx];
    };
}



void MCompare(const MCharImage& Image0, const MCharImage& Image1, int Border) {
  assert(Image0.x()== Image1.x() && Image0.y()== Image1.y());
  int IsSame= 1;
  for (int y= Border; y< Image0.y()-Border; y++)
    for (int x= Border; x< Image1.x()-Border; x++)
      if (Image0.y_(x,y)!= Image1.y_(x,y)) {
	cout << x << " " << y << "= " << int(Image0.y_(x,y)) << " "
	     << int(Image1.y_(x,y)) << " , ";
	IsSame= 0;
      };
  // if (!IsSame)
    cout << endl << endl << flush;
}


//----------------------------------------------------------------------------------------
//  Conversion MShortImage->MCharImage
//----------------------------------------------------------------------------------------
void MCharImage::ConvertFrom(const MShortImage& Image)
{
  SetSize(Image.x(), Image.y());
  const unsigned short *In= &Image.M_(0,0)-1;
  unsigned char *Out= yuv-1;
  for (int i= x__*y__; i; i--)
    {
      yuv[i]= (unsigned char)(In[i]);
    }
  memset(yuv+x__*y__, 128, x__*y__/2);
}



//----------------------------------------------------------------------------------------
//  Gradient_101
//----------------------------------------------------------------------------------------
#ifdef ARCH_X86
static inline void Mmmx_101yGradient1(const unsigned char *In)
{
  static mmx_t mmx0=  {0x0000000000000000ull};
  movq_m2r(mmx0,mm6);
  movq_m2r(*In,mm0);
  pavgb_r2r(mm6,mm0);
}
static inline void Mmmx_101yGradient2(const unsigned char* In,unsigned char* Out,int xStrideIn)
{
  static mmx_t mmx255={0xFFFFFFFFFFFFFFFFull};
  movq_m2r(mmx255,mm7);
  pavgb_m2r(*(In+xStrideIn),mm7);
  movq_m2r(*(In-xStrideIn),mm1);
  pavgb_r2r(mm6,mm1);
  psubusb_r2r(mm1,mm7);
  #ifdef __WITH_MOVNTQ__
  movntq_r2m(mm7,*Out);
  #else
  movq_r2m(mm7,*Out);
  #endif
}
static inline void Mmmx_101yGradient(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {
  const unsigned char *PteIn;
  unsigned char *PteOut;
  int fin=(y*xStrideIn-xStrideIn);
  Mmmx_101yGradient1(In);
  for (int yy=xStrideIn; yy<fin; yy+=xStrideIn)
    {
      PteIn=In+yy;
      PteOut=Out+yy;
      for (int xx=0; xx<x;xx+=8)
	{
	  Mmmx_101yGradient2(PteIn,PteOut,xStrideIn);
	  PteIn+=8;
	  PteOut+=8;
	}
    }

}

static inline void Mmmx_101xGradient1(const unsigned char* In,unsigned char *Out,int xStrideIn)
{
  static mmx_t mmx0=  {0x0000000000000000ull};
  static mmx_t mmx255=    {0xFFFFFFFFFFFFFFFFull};
  static mmx_t mmx128=    {0x7F7F7F7F7F7F7F7Full};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};

  movq_m2r(mmx0,mm6);

  movq_m2r(*In,mm0);             // mm0= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r(mm0,mm5);

  //Decalage de mm0 vers la droite
  movq_r2r(mm0,mm1);

  punpckhbw_r2r(mm6,mm0);        // mm0= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm1);        // mm1= 00 i3 00 i2 00 i1 00 i0

  pshufw_r2r(mm0,mm0,0x93);      // mm0= 00 i6 00 i5 00 i4 00 i7
  pshufw_r2r(mm1,mm1,0x93);      // mm1= 00 i2 00 i1 00 i0 00 i3

  movq_r2r(mm1,mm2);
  pand_m2r(mmx_000F,mm2);        // mm2= 00 00 00 00 00 00 00 i3
  pand_m2r(mmx_FFF0,mm0);        // mm0= 00 i6 00 i5 00 i4 00 00
  paddusw_r2r(mm2,mm0);          // mm0= 00 i6 00 i5 00 i4 00 i3  Ok pour mm0
  psubusw_r2r(mm2,mm1);          // mm1= 00 i2 00 i1 00 i0 00 00
  packuswb_r2r(mm0,mm1);         // mm1= i6 i5 i4 i3 i2 i1 i0 00
  pavgb_r2r(mm6,mm1);  //Division par 2, resultat partiel dans mm1

  //decalage de mm5 vers la gauche
  movq_r2r(mm5,mm0);
  punpckhbw_r2r(mm6,mm5);        // mm5= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm0);        // mm0= 00 i3 00 i2 00 i1 00 i0

  pshufw_r2r(mm5,mm5,0x39);      // mm5= 00 i4 00 i7 00 i6 00 i5
  pshufw_r2r(mm0,mm0,0x39);      // mm0= 00 i0 00 i3 00 i2 00 i1
  movq_r2r(mm5,mm4);             // mm4= 00 i4 00 i7 00 i6 00 i5
  pand_m2r(mmx_F000,mm4);        // mm4= 00 i4 00 00 00 00 00 00
  pand_m2r(mmx_0FFF,mm0);        // mm0= 00 00 00 i3 00 i2 00 i1
  paddusw_r2r(mm4,mm0);          // mm0= 00 i4 00 i3 00 i2 00 i1  Ok pour mm0

  psubusw_r2r(mm4,mm5);          // mm5= 00 00 00 i7 00 i6 00 i5
  movq_m2r(*(In+8),mm2);         // mm2= j7 j6 j5 j4 j3 j2 j1 j0
  punpcklbw_r2r(mm6,mm2);        // mm2= 00 j3 00 j2 00 j1 00 j0
  pshufw_r2r(mm2,mm2,0x39);      // mm2= 00 j0 00 j3 00 j2 00 j1
  pand_m2r(mmx_F000,mm2);        // mm2= 00 j0 00 00 00 00 00 00
  paddusw_r2r(mm2,mm5);          // mm5= 00 j0 00 i7 00 i6 00 i5  Ok pour mm5
  packuswb_r2r(mm5,mm0);         // mm0= j0 i7 i6 i5 i4 i3 i2 i1
  pavgb_r2r(mm6,mm0);  //Division par 2, resultat partiel dans mm0

  movq_m2r(mmx128,mm2);
  psubusb_r2r(mm1,mm2);
  paddusb_r2r(mm0,mm2);

  #ifdef __WITH_MOVNTQ__
  movntq_r2m(mm2,*Out);
  #else
  movq_r2m(mm2,*Out);
  #endif
}
static inline void Mmmx_101xGradient2(const unsigned char* In,unsigned char *Out,int xStrideIn)
{
  static mmx_t mmx255=    {0xFFFFFFFFFFFFFFFFull};
  static mmx_t mmx128=    {0x7F7F7F7F7F7F7F7Full};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};
  static mmx_t mmx_0000=  {0x0000000000000000ull};

  movq_m2r(*In,mm0);             // mm0= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r(mm0,mm5);

  //Decalage de mm0 vers la droite
  movq_r2r(mm0,mm1);

  punpckhbw_r2r(mm6,mm0);        // mm0= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm1);        // mm1= 00 i3 00 i2 00 i1 00 i0

  pshufw_r2r(mm0,mm0,0x93);      // mm0= 00 i6 00 i5 00 i4 00 i7
  pshufw_r2r(mm1,mm1,0x93);      // mm1= 00 i2 00 i1 00 i0 00 i3

  movq_r2r(mm1,mm2);
  pand_m2r(mmx_000F,mm2);        // mm2= 00 00 00 00 00 00 00 i3
  pand_m2r(mmx_FFF0,mm0);        // mm0= 00 i6 00 i5 00 i4 00 00
  paddusw_r2r(mm2,mm0);              // mm0= 00 i6 00 i5 00 i4 00 i3  Ok pour mm0

  psubusw_r2r(mm2,mm1);          // mm1= 00 i2 00 i1 00 i0 00 00
  movq_m2r(*(In-8),mm2);         // mm2= j7 j6 j5 j4 j3 j2 j1 j0
  punpckhbw_r2r(mm6,mm2);        // mm2= 00 j7 00 j6 00 j5 00 j4
  pshufw_r2r(mm2,mm2,0x93);      // mm2= 00 j6 00 j5 00 j4 00 j7
  pand_m2r(mmx_000F,mm2);        // mm2= 00 00 00 00 00 00 00 j7
  paddusw_r2r(mm2,mm1);             // mm1= 00 i2 00 i1 00 i0 00 j7  Ok pour mm1

  packuswb_r2r(mm0,mm1);             // mm1= i6 i5 i4 i3 i2 i1 i0 j7
  pavgb_r2r(mm6,mm1);  //Division par 2

  //decalage de mm5 vers la gauche
  //mm1 ne peut plus etre utilise, les autres registres sont libres
  movq_r2r(mm5,mm0);
  punpckhbw_r2r(mm6,mm5);        // mm5= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm0);        // mm0= 00 i3 00 i2 00 i1 00 i0

  pshufw_r2r(mm5,mm5,0x39);      // mm5= 00 i4 00 i7 00 i6 00 i5
  pshufw_r2r(mm0,mm0,0x39);      // mm0= 00 i0 00 i3 00 i2 00 i1
  movq_r2r(mm5,mm4);             // mm4= 00 i4 00 i7 00 i6 00 i5
  pand_m2r(mmx_F000,mm4);        // mm4= 00 i4 00 00 00 00 00 00
  pand_m2r(mmx_0FFF,mm0);        // mm0= 00 00 00 i3 00 i2 00 i1
  paddusw_r2r(mm4,mm0);          // mm0= 00 i4 00 i3 00 i2 00 i1  Ok pour mm0

  psubusw_r2r(mm4,mm5);          // mm5= 00 00 00 i7 00 i6 00 i5
  movq_m2r(*(In+8),mm2);         // mm2= j7 j6 j5 j4 j3 j2 j1 j0
  punpcklbw_r2r(mm6,mm2);        // mm2= 00 j3 00 j2 00 j1 00 j0
  pshufw_r2r(mm2,mm2,0x39);      // mm2= 00 j0 00 j3 00 j2 00 j1
  pand_m2r(mmx_F000,mm2);        // mm2= 00 j0 00 00 00 00 00 00
  paddusw_r2r(mm2,mm5);          // mm5= 00 j0 00 i7 00 i6 00 i5  Ok pour mm5
  packuswb_r2r(mm5,mm0);         // mm0= j0 i7 i6 i5 i4 i3 i2 i1
  pavgb_r2r(mm6,mm0);  //Division par 2

  movq_m2r(mmx128,mm2);
  psubusb_r2r(mm1,mm2);
  paddusb_r2r(mm0,mm2);

  #ifdef __WITH_MOVNTQ__
  movntq_r2m(mm2,*Out);
  #else
  movq_r2m(mm2,*Out);
  #endif
}
static inline void Mmmx_101xGradient3(const unsigned char* In,unsigned char *Out,int xStrideIn)
{
  static mmx_t mmx255=    {0xFFFFFFFFFFFFFFFFull};
  static mmx_t mmx128=    {0x7F7F7F7F7F7F7F7Full};
  static mmx_t mmx_000F=  {0x000000000000FFFFull};
  static mmx_t mmx_FFF0=  {0xFFFFFFFFFFFF0000ull};
  static mmx_t mmx_F000=  {0xFFFF000000000000ull};
  static mmx_t mmx_0FFF=  {0x0000FFFFFFFFFFFFull};
  static mmx_t mmx_0000=  {0x0000000000000000ull};

  movq_m2r(*In,mm0);             // mm0= i7 i6 i5 i4 i3 i2 i1 i0
  movq_r2r(mm0,mm5);

  //Decalage de mm0 vers la droite
  movq_r2r(mm0,mm1);

  punpckhbw_r2r(mm6,mm0);        // mm0= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm1);        // mm1= 00 i3 00 i2 00 i1 00 i0

  pshufw_r2r(mm0,mm0,0x93);      // mm0= 00 i6 00 i5 00 i4 00 i7
  pshufw_r2r(mm1,mm1,0x93);      // mm1= 00 i2 00 i1 00 i0 00 i3

  movq_r2r(mm1,mm2);
  pand_m2r(mmx_000F,mm2);        // mm2= 00 00 00 00 00 00 00 i3
  pand_m2r(mmx_FFF0,mm0);        // mm0= 00 i6 00 i5 00 i4 00 00
  paddusw_r2r(mm2,mm0);              // mm0= 00 i6 00 i5 00 i4 00 i3  Ok pour mm0

  psubusw_r2r(mm2,mm1);          // mm1= 00 i2 00 i1 00 i0 00 00
  movq_m2r(*(In-8),mm2);         // mm2= j7 j6 j5 j4 j3 j2 j1 j0
  punpckhbw_r2r(mm6,mm2);        // mm2= 00 j7 00 j6 00 j5 00 j4
  pshufw_r2r(mm2,mm2,0x93);      // mm2= 00 j6 00 j5 00 j4 00 j7
  pand_m2r(mmx_000F,mm2);        // mm2= 00 00 00 00 00 00 00 j7
  paddusw_r2r(mm2,mm1);             // mm1= 00 i2 00 i1 00 i0 00 j7  Ok pour mm1

  packuswb_r2r(mm0,mm1);             // mm1= i6 i5 i4 i3 i2 i1 i0 j7
  pavgb_r2r(mm6,mm1);  //Division par 2

  //decalage de mm5 vers la gauche
  //mm1 ne peut plus etre utilise, les autres registres sont libres
  movq_r2r(mm5,mm0);
  punpckhbw_r2r(mm6,mm5);        // mm5= 00 i7 00 i6 00 i5 00 i4
  punpcklbw_r2r(mm6,mm0);        // mm0= 00 i3 00 i2 00 i1 00 i0

  pshufw_r2r(mm5,mm5,0x39);      // mm5= 00 i4 00 i7 00 i6 00 i5
  pshufw_r2r(mm0,mm0,0x39);      // mm0= 00 i0 00 i3 00 i2 00 i1
  movq_r2r(mm5,mm4);             // mm4= 00 i4 00 i7 00 i6 00 i5
  pand_m2r(mmx_F000,mm4);        // mm4= 00 i4 00 00 00 00 00 00
  pand_m2r(mmx_0FFF,mm0);        // mm0= 00 00 00 i3 00 i2 00 i1
  paddusw_r2r(mm4,mm0);          // mm0= 00 i4 00 i3 00 i2 00 i1  Ok pour mm0

  psubusw_r2r(mm4,mm5);          // mm5= 00 00 00 i7 00 i6 00 i5
  packuswb_r2r(mm5,mm0);         // mm0= 00 i7 i6 i5 i4 i3 i2 i1
  pavgb_r2r(mm6,mm0);  //Division par 2

  movq_m2r(mmx128,mm2);
  psubusb_r2r(mm1,mm2);
  paddusb_r2r(mm0,mm2);

  #ifdef __WITH_MOVNTQ__
  movntq_r2m(mm2,*Out);
  #else
  movq_r2m(mm2,*Out);
  #endif
}
static inline void Mmmx_101xGradient(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut)
{
  const unsigned char *PteIn;
  unsigned char *PteOut;
  int fin=y*xStrideIn;

  for (int yy=0; yy<fin; yy+=xStrideIn)
    {
      PteIn=In+yy;
      PteOut=Out+yy;
      //filtrage 8 premiers octets
      Mmmx_101xGradient1(PteIn,PteOut,xStrideIn);
      PteIn+=8;
      PteOut+=8;
      for (int xx=8; xx<x-8;xx+=8)
	{
	  Mmmx_101xGradient2(PteIn,PteOut,xStrideIn);
	  PteIn+=8;
	  PteOut+=8;
	}
      //filtrage des 8 derniers octets
      Mmmx_101xGradient3(PteIn,PteOut,xStrideIn);
    }
}
#else
static inline void Mmmx_101yGradient(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {}
static inline void Mmmx_101xGradient(const unsigned char* In, int xStrideIn,
			   int x, int y, unsigned char* Out, int xStrideOut) {}
#endif

void MCharImage::Gradient_101(int IsHorizontal0Vertical1,
			       const MCharImage& Data) {
  SetSize(Data.x__,Data.y__);
  if (IsHorizontal0Vertical1)
    {
      //Gradient vertical
      if (MWithMmxSseUse()) {
	Mmmx_101yGradient(Data.yuv, x__, x__, y__, yuv, x__);
	Memms(); // now FPU can be used
      } else{
	for (int xx=0; xx<x__; xx++)
	  {
	    unsigned char *PteIn=Data.yuv+xx;
	    unsigned char *PteOut=yuv+xx;
	    for (int yy=1;yy<y__-1;yy++)
	      {
		*(PteOut)=128-(*(PteIn-x__))/2+(*(PteIn+x__))/2;
		PteIn+=x__;
		PteOut+=x__;
	      }
	  }
      }
    }
  else
    {
      //Gradient horizontal
      if (MWithMmxSseUse()) {
	Mmmx_101xGradient(Data.yuv, x__, x__, y__, yuv, x__);
	Memms(); // now FPU can be used
      } else{
	for (int yy=0; yy<y__; yy++)
	  {
	    unsigned char *PteIn=Data.yuv+yy*x__+1;
	    unsigned char *PteOut=yuv+yy*x__+1;
	    for (int xx=1;xx<x__-1;xx++)
	      {
		*(PteOut)=128-(*(PteIn-1))/2+(*(PteIn+1))/2;
		PteIn++;
		PteOut++;
	      }
	  }
      }
    }
  NoColor();
}


//----------------------------------------------------------------------------------------
//  AbsGradient_101
//----------------------------------------------------------------------------------------
#ifdef ARCH_X86
static inline void MmmxAbsGradient_101y1(const unsigned char *In,unsigned char *Out,int xStride)
{
  movq_m2r(*(In-xStride),mm0);
  movq_m2r(*(In+xStride),mm1);
  movq_r2r(mm1,mm2);

  psubusb_r2r(mm0,mm1);
  psubusb_r2r(mm2,mm0);

  pmaxub_r2r(mm0,mm1);
  movq_r2m(mm1,*Out);
}
static inline void MmmxAbsGradient_101y(const unsigned char *In,unsigned char *Out,int x,int y,int xStride)
{
  for (int yy=1;yy<y-1;yy++)
    {
      const unsigned char* PtIn=In+yy*x;
      unsigned char* PtOut=Out+yy*x;
      for (int xx=0;xx<x;xx+=8)
	{
	  MmmxAbsGradient_101y1(PtIn,PtOut,xStride);
	  PtIn+=8;
	  PtOut+=8;
	}
    }

}
static inline void MmmxAbsGradient_101x(const unsigned char *In,unsigned char *Out,int x,int y,int xStride)
{

}
#else
static inline void MmmxAbsGradient_101y(const unsigned char *In,unsigned char *Out,int x,int y,int xStride) {}
static inline void MmmxAbsGradient_101x(const unsigned char *In,unsigned char *Out,int x,int y,int xStride) {}
#endif

void MCharImage::AbsGradient_101(int IsHorizontal0Vertical1,
			       const MCharImage& Data) {
  SetSize(Data.x__,Data.y__);
  if (IsHorizontal0Vertical1)
    {
      //Gradient vertical
      if (MWithMmxSseUse()) {
	MmmxAbsGradient_101y(Data.yuv, yuv, x__, y__, x__);
	Memms(); // now FPU can be used
	} else {
	for (int xx=0; xx<x__; xx++)
	  {
	    unsigned char *PteIn=Data.yuv+xx;
	    unsigned char *PteOut=yuv+xx;
	    for (int yy=1;yy<y__-1;yy++)
	      {
		*(PteOut)=abs(*(PteIn+x__)-*(PteIn-x__));
		PteIn+=x__;
		PteOut+=x__;
	      }
	  }
      }
    }
  else
    {
      //Gradient horizontal
      /*if (MWithMmxSseUse()) {
	Mmmx_101xGradient(Data.yuv, x__, x__, y__, yuv, x__);
	Memms(); // now FPU can be used
	} else */{
	for (int yy=0; yy<y__; yy++)
	  {
	    unsigned char *PteIn=Data.yuv+yy*x__+1;
	    unsigned char *PteOut=yuv+yy*x__+1;
	    for (int xx=1;xx<x__-1;xx++)
	      {
		*(PteOut)=abs(*(PteIn+1)-*(PteIn-1));
		PteIn++;
		PteOut++;
	      }
	  }
      }
    }
  NoColor();
}


//-----------------------------------------------------------------------------
// EgaliseHistogramme
//-----------------------------------------------------------------------------
void MCharImage::EgaliseHistogramme(unsigned char GrisMin,unsigned char GrisMax)
{
	//Calcul de l'histogramme
	int* Histo=new int[GrisMax-GrisMin+1];
	int* Transfo=new int[GrisMax-GrisMin+1];

  for (int g=0;g<(GrisMax-GrisMin+1);g++) Histo[g]=0;
	int NbGris=0;
	for (int i=0;i<x__;i++)
	{
		for (int j=0;j<y__;j++)
		{
			unsigned char g=yuv[i+j*x__];
			if ((g>=GrisMin) && (g<=GrisMax))
			{
				Histo[g-GrisMin]+=1;
				NbGris++;
			}
		}
	}

	//Calcul de la transformation des niveaux de gris
	int NbNiveau=NbGris/(GrisMax-GrisMin+1);
  int gdest=0;
  int SommeDest=0;
	for (int g=0;g<(GrisMax-GrisMin+1);g++)
	{
		SommeDest+=Histo[g];
 		while ((SommeDest>NbNiveau*(gdest+1)) && gdest<GrisMax) gdest++;
		Transfo[g]=gdest;
	}

  //Application sur l'image
	for (int i=0;i<x__;i++)
	{
		for (int j=0;j<y__;j++)
		{
			unsigned char g=yuv[i+j*x__];
			if ((g>=GrisMin) && (g<=GrisMax))
			{
        yuv[i+j*x__]=Transfo[g-GrisMin];
      }
    }
  }  
	delete [] Histo;
	delete [] Transfo;
}


