webrtc_H265player/test/prod_decoder/codec/3rdparty/tinyh264/h264bsd_reconstruct.c


								/*

								 * Copyright (C) 2009 The Android Open Source Project

								 * Modified for use by h264bsd standalone library

								 *

								 * Licensed under the Apache License, Version 2.0 (the "License");

								 * you may not use this file except in compliance with the License.

								 * You may obtain a copy of the License at

								 *

								 *      http://www.apache.org/licenses/LICENSE-2.0

								 *

								 * Unless required by applicable law or agreed to in writing, software

								 * distributed under the License is distributed on an "AS IS" BASIS,

								 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

								 * See the License for the specific language governing permissions and

								 * limitations under the License.

								 */


								/*------------------------------------------------------------------------------


								    Table of contents


								     1. Include headers

								     2. External compiler flags

								     3. Module defines

								     4. Local function prototypes

								     5. Functions


								------------------------------------------------------------------------------*/


								/*------------------------------------------------------------------------------

								    1. Include headers

								------------------------------------------------------------------------------*/


								#include "basetype.h"

								#include "h264bsd_reconstruct.h"

								#include "h264bsd_macroblock_layer.h"

								#include "h264bsd_image.h"

								#include "h264bsd_util.h"


								#ifdef H264DEC_OMXDL

								#include "omxtypes.h"

								#include "omxVC.h"

								#include "armVC.h"

								#endif /* H264DEC_OMXDL */


								/*------------------------------------------------------------------------------

								    2. External compiler flags

								--------------------------------------------------------------------------------


								--------------------------------------------------------------------------------

								    3. Module defines

								------------------------------------------------------------------------------*/


								/* Switch off the following Lint messages for this file:

								 * Info 701: Shift left of signed quantity (int)

								 * Info 702: Shift right of signed quantity (int)

								 */

								/*lint -e701 -e702 */


								/* Luma fractional-sample positions

								 *

								 *  G a b c H

								 *  d e f g

								 *  h i j k m

								 *  n p q r

								 *  M   s   N

								 *

								 *  G, H, M and N are integer sample positions

								 *  a-s are fractional samples that need to be interpolated.

								 */

								#ifndef H264DEC_OMXDL

								static const u32 lumaFracPos[4][4] = {

								  /* G  d  h  n    a  e  i  p    b  f  j   q     c   g   k   r */

								    {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};

								#endif /* H264DEC_OMXDL */


								/* clipping table, defined in h264bsd_intra_prediction.c */

								extern const u8 h264bsdClip[];


								/*------------------------------------------------------------------------------

								    4. Local function prototypes

								------------------------------------------------------------------------------*/


								#ifndef H264DEC_OMXDL


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateChromaHor


								        Functional description:

								          This function performs chroma interpolation in horizontal direction.

								          Overfilling is done only if needed. Reference image (pRef) is

								          read at correct position and the predicted part is written to

								          macroblock's chrominance (predPartChroma)

								        Inputs:

								          pRef              pointer to reference frame Cb top-left corner

								          x0                integer x-coordinate for prediction

								          y0                integer y-coordinate for prediction

								          width             width of the reference frame chrominance in pixels

								          height            height of the reference frame chrominance in pixels

								          xFrac             horizontal fraction for prediction in 1/8 pixels

								          chromaPartWidth   width of the predicted part in pixels

								          chromaPartHeight  height of the predicted part in pixels

								        Outputs:

								          predPartChroma    pointer where predicted part is written


								------------------------------------------------------------------------------*/

								#ifndef H264DEC_ARM11

								void h264bsdInterpolateChromaHor(

								  u8 *pRef,

								  u8 *predPartChroma,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 xFrac,

								  u32 chromaPartWidth,

								  u32 chromaPartHeight)

								{


								/* Variables */


								    u32 x, y, tmp1, tmp2, tmp3, tmp4, c, val;

								    u8 *ptrA, *cbr;

								    u32 comp;

								    u8 block[9*8*2];


								/* Code */


								    ASSERT(predPartChroma);

								    ASSERT(chromaPartWidth);

								    ASSERT(chromaPartHeight);

								    ASSERT(xFrac < 8);

								    ASSERT(pRef);


								    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||

								        (y0 < 0) || ((u32)y0+chromaPartHeight > height))

								    {

								        h264bsdFillBlock(pRef, block, x0, y0, width, height,

								            chromaPartWidth + 1, chromaPartHeight, chromaPartWidth + 1);

								        pRef += width * height;

								        h264bsdFillBlock(pRef, block + (chromaPartWidth+1)*chromaPartHeight,

								            x0, y0, width, height, chromaPartWidth + 1,

								            chromaPartHeight, chromaPartWidth + 1);


								        pRef = block;

								        x0 = 0;

								        y0 = 0;

								        width = chromaPartWidth+1;

								        height = chromaPartHeight;

								    }


								    val = 8 - xFrac;


								    for (comp = 0; comp <= 1; comp++)

								    {


								        ptrA = pRef + (comp * height + (u32)y0) * width + x0;

								        cbr = predPartChroma + comp * 8 * 8;


								        /* 2x2 pels per iteration

								         * bilinear horizontal interpolation */

								        for (y = (chromaPartHeight >> 1); y; y--)

								        {

								            for (x = (chromaPartWidth >> 1); x; x--)

								            {

								                tmp1 = ptrA[width];

								                tmp2 = *ptrA++;

								                tmp3 = ptrA[width];

								                tmp4 = *ptrA++;

								                c = ((val * tmp1 + xFrac * tmp3) << 3) + 32;

								                c >>= 6;

								                cbr[8] = (u8)c;

								                c = ((val * tmp2 + xFrac * tmp4) << 3) + 32;

								                c >>= 6;

								                *cbr++ = (u8)c;

								                tmp1 = ptrA[width];

								                tmp2 = *ptrA;

								                c = ((val * tmp3 + xFrac * tmp1) << 3) + 32;

								                c >>= 6;

								                cbr[8] = (u8)c;

								                c = ((val * tmp4 + xFrac * tmp2) << 3) + 32;

								                c >>= 6;

								                *cbr++ = (u8)c;

								            }

								            cbr += 2*8 - chromaPartWidth;

								            ptrA += 2*width - chromaPartWidth;

								        }

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateChromaVer


								        Functional description:

								          This function performs chroma interpolation in vertical direction.

								          Overfilling is done only if needed. Reference image (pRef) is

								          read at correct position and the predicted part is written to

								          macroblock's chrominance (predPartChroma)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateChromaVer(

								  u8 *pRef,

								  u8 *predPartChroma,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 yFrac,

								  u32 chromaPartWidth,

								  u32 chromaPartHeight)

								{


								/* Variables */


								    u32 x, y, tmp1, tmp2, tmp3, c, val;

								    u8 *ptrA, *cbr;

								    u32 comp;

								    u8 block[9*8*2];


								/* Code */


								    ASSERT(predPartChroma);

								    ASSERT(chromaPartWidth);

								    ASSERT(chromaPartHeight);

								    ASSERT(yFrac < 8);

								    ASSERT(pRef);


								    if ((x0 < 0) || ((u32)x0+chromaPartWidth > width) ||

								        (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))

								    {

								        h264bsdFillBlock(pRef, block, x0, y0, width, height, chromaPartWidth,

								            chromaPartHeight + 1, chromaPartWidth);

								        pRef += width * height;

								        h264bsdFillBlock(pRef, block + chromaPartWidth*(chromaPartHeight+1),

								            x0, y0, width, height, chromaPartWidth,

								            chromaPartHeight + 1, chromaPartWidth);


								        pRef = block;

								        x0 = 0;

								        y0 = 0;

								        width = chromaPartWidth;

								        height = chromaPartHeight+1;

								    }


								    val = 8 - yFrac;


								    for (comp = 0; comp <= 1; comp++)

								    {


								        ptrA = pRef + (comp * height + (u32)y0) * width + x0;

								        cbr = predPartChroma + comp * 8 * 8;


								        /* 2x2 pels per iteration

								         * bilinear vertical interpolation */

								        for (y = (chromaPartHeight >> 1); y; y--)

								        {

								            for (x = (chromaPartWidth >> 1); x; x--)

								            {

								                tmp3 = ptrA[width*2];

								                tmp2 = ptrA[width];

								                tmp1 = *ptrA++;

								                c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;

								                c >>= 6;

								                cbr[8] = (u8)c;

								                c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;

								                c >>= 6;

								                *cbr++ = (u8)c;

								                tmp3 = ptrA[width*2];

								                tmp2 = ptrA[width];

								                tmp1 = *ptrA++;

								                c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;

								                c >>= 6;

								                cbr[8] = (u8)c;

								                c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;

								                c >>= 6;

								                *cbr++ = (u8)c;

								            }

								            cbr += 2*8 - chromaPartWidth;

								            ptrA += 2*width - chromaPartWidth;

								        }

								    }


								}

								#endif

								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateChromaHorVer


								        Functional description:

								          This function performs chroma interpolation in horizontal and

								          vertical direction. Overfilling is done only if needed. Reference

								          image (ref) is read at correct position and the predicted part

								          is written to macroblock's chrominance (predPartChroma)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateChromaHorVer(

								  u8 *ref,

								  u8 *predPartChroma,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 xFrac,

								  u32 yFrac,

								  u32 chromaPartWidth,

								  u32 chromaPartHeight)

								{

								    u8 block[9*9*2];

								    u32 x, y, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, valX, valY, plus32 = 32;

								    u32 comp;

								    u8 *ptrA, *cbr;


								/* Code */


								    ASSERT(predPartChroma);

								    ASSERT(chromaPartWidth);

								    ASSERT(chromaPartHeight);

								    ASSERT(xFrac < 8);

								    ASSERT(yFrac < 8);

								    ASSERT(ref);


								    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||

								        (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))

								    {

								        h264bsdFillBlock(ref, block, x0, y0, width, height,

								            chromaPartWidth + 1, chromaPartHeight + 1, chromaPartWidth + 1);

								        ref += width * height;

								        h264bsdFillBlock(ref, block + (chromaPartWidth+1)*(chromaPartHeight+1),

								            x0, y0, width, height, chromaPartWidth + 1,

								            chromaPartHeight + 1, chromaPartWidth + 1);


								        ref = block;

								        x0 = 0;

								        y0 = 0;

								        width = chromaPartWidth+1;

								        height = chromaPartHeight+1;

								    }


								    valX = 8 - xFrac;

								    valY = 8 - yFrac;


								    for (comp = 0; comp <= 1; comp++)

								    {


								        ptrA = ref + (comp * height + (u32)y0) * width + x0;

								        cbr = predPartChroma + comp * 8 * 8;


								        /* 2x2 pels per iteration

								         * bilinear vertical and horizontal interpolation */

								        for (y = (chromaPartHeight >> 1); y; y--)

								        {

								            tmp1 = *ptrA;

								            tmp3 = ptrA[width];

								            tmp5 = ptrA[width*2];

								            tmp1 *= valY;

								            tmp1 += tmp3 * yFrac;

								            tmp3 *= valY;

								            tmp3 += tmp5 * yFrac;

								            for (x = (chromaPartWidth >> 1); x; x--)

								            {

								                tmp2 = *++ptrA;

								                tmp4 = ptrA[width];

								                tmp6 = ptrA[width*2];

								                tmp2 *= valY;

								                tmp2 += tmp4 * yFrac;

								                tmp4 *= valY;

								                tmp4 += tmp6 * yFrac;

								                tmp1 = tmp1 * valX + plus32;

								                tmp3 = tmp3 * valX + plus32;

								                tmp1 += tmp2 * xFrac;

								                tmp1 >>= 6;

								                tmp3 += tmp4 * xFrac;

								                tmp3 >>= 6;

								                cbr[8] = (u8)tmp3;

								                *cbr++ = (u8)tmp1;


								                tmp1 = *++ptrA;

								                tmp3 = ptrA[width];

								                tmp5 = ptrA[width*2];

								                tmp1 *= valY;

								                tmp1 += tmp3 * yFrac;

								                tmp3 *= valY;

								                tmp3 += tmp5 * yFrac;

								                tmp2 = tmp2 * valX + plus32;

								                tmp4 = tmp4 * valX + plus32;

								                tmp2 += tmp1 * xFrac;

								                tmp2 >>= 6;

								                tmp4 += tmp3 * xFrac;

								                tmp4 >>= 6;

								                cbr[8] = (u8)tmp4;

								                *cbr++ = (u8)tmp2;

								            }

								            cbr += 2*8 - chromaPartWidth;

								            ptrA += 2*width - chromaPartWidth;

								        }

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: PredictChroma


								        Functional description:

								          Top level chroma prediction function that calls the appropriate

								          interpolation function. The output is written to macroblock array.


								------------------------------------------------------------------------------*/


								static void PredictChroma(

								  u8 *mbPartChroma,

								  u32 xAL,

								  u32 yAL,

								  u32 partWidth,

								  u32 partHeight,

								  mv_t *mv,

								  image_t *refPic)

								{


								/* Variables */


								    u32 xFrac, yFrac, width, height, chromaPartWidth, chromaPartHeight;

								    i32 xInt, yInt;

								    u8 *ref;


								/* Code */


								    ASSERT(mv);

								    ASSERT(refPic);

								    ASSERT(refPic->data);

								    ASSERT(refPic->width);

								    ASSERT(refPic->height);


								    width  = 8 * refPic->width;

								    height = 8 * refPic->height;


								    xInt = (xAL >> 1) + (mv->hor >> 3);

								    yInt = (yAL >> 1) + (mv->ver >> 3);

								    xFrac = mv->hor & 0x7;

								    yFrac = mv->ver & 0x7;


								    chromaPartWidth  = partWidth >> 1;

								    chromaPartHeight = partHeight >> 1;

								    ref = refPic->data + 256 * refPic->width * refPic->height;


								    if (xFrac && yFrac)

								    {

								        h264bsdInterpolateChromaHorVer(ref, mbPartChroma, xInt, yInt, width,

								                height, xFrac, yFrac, chromaPartWidth, chromaPartHeight);

								    }

								    else if (xFrac)

								    {

								        h264bsdInterpolateChromaHor(ref, mbPartChroma, xInt, yInt, width,

								                height, xFrac, chromaPartWidth, chromaPartHeight);

								    }

								    else if (yFrac)

								    {

								        h264bsdInterpolateChromaVer(ref, mbPartChroma, xInt, yInt, width,

								                height, yFrac, chromaPartWidth, chromaPartHeight);

								    }

								    else

								    {

								        h264bsdFillBlock(ref, mbPartChroma, xInt, yInt, width, height,

								            chromaPartWidth, chromaPartHeight, 8);

								        ref += width * height;

								        h264bsdFillBlock(ref, mbPartChroma + 8*8, xInt, yInt, width, height,

								            chromaPartWidth, chromaPartHeight, 8);

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateVerHalf


								        Functional description:

								          Function to perform vertical interpolation of pixel position 'h'

								          for a block. Overfilling is done only if needed. Reference

								          image (ref) is read at correct position and the predicted part

								          is written to macroblock array (mb)


								------------------------------------------------------------------------------*/

								#ifndef H264DEC_ARM11

								void h264bsdInterpolateVerHalf(

								  u8 *ref,

								  u8 *mb,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 partWidth,

								  u32 partHeight)

								{

								    u32 p1[21*21/4+1];

								    u32 i, j;

								    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

								    u8 *ptrC, *ptrV;

								    const u8 *clp = h264bsdClip + 512;


								    /* Code */


								    ASSERT(ref);

								    ASSERT(mb);


								    if ((x0 < 0) || ((u32)x0+partWidth > width) ||

								        (y0 < 0) || ((u32)y0+partHeight+5 > height))

								    {

								        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,

								                partWidth, partHeight+5, partWidth);


								        x0 = 0;

								        y0 = 0;

								        ref = (u8*)p1;

								        width = partWidth;

								    }


								    ref += (u32)y0 * width + (u32)x0;


								    ptrC = ref + width;

								    ptrV = ptrC + 5*width;


								    /* 4 pixels per iteration, interpolate using 5 vertical samples */

								    for (i = (partHeight >> 2); i; i--)

								    {

								        /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */

								        for (j = partWidth; j; j--)

								        {

								            tmp4 = ptrV[-(i32)width*2];

								            tmp5 = ptrV[-(i32)width];

								            tmp1 = ptrV[width];

								            tmp2 = ptrV[width*2];

								            tmp6 = *ptrV++;


								            tmp7 = tmp4 + tmp1;

								            tmp2 -= (tmp7 << 2);

								            tmp2 -= tmp7;

								            tmp2 += 16;

								            tmp7 = tmp5 + tmp6;

								            tmp3 = ptrC[width*2];

								            tmp2 += (tmp7 << 4);

								            tmp2 += (tmp7 << 2);

								            tmp2 += tmp3;

								            tmp2 = clp[tmp2>>5];

								            tmp1 += 16;

								            mb[48] = (u8)tmp2;


								            tmp7 = tmp3 + tmp6;

								            tmp1 -= (tmp7 << 2);

								            tmp1 -= tmp7;

								            tmp7 = tmp4 + tmp5;

								            tmp2 = ptrC[width];

								            tmp1 += (tmp7 << 4);

								            tmp1 += (tmp7 << 2);

								            tmp1 += tmp2;

								            tmp1 = clp[tmp1>>5];

								            tmp6 += 16;

								            mb[32] = (u8)tmp1;


								            tmp7 = tmp2 + tmp5;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp7 = tmp4 + tmp3;

								            tmp1 = *ptrC;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp6 += tmp1;

								            tmp6 = clp[tmp6>>5];

								            tmp5 += 16;

								            mb[16] = (u8)tmp6;


								            tmp1 += tmp4;

								            tmp5 -= (tmp1 << 2);

								            tmp5 -= tmp1;

								            tmp3 += tmp2;

								            tmp6 = ptrC[-(i32)width];

								            tmp5 += (tmp3 << 4);

								            tmp5 += (tmp3 << 2);

								            tmp5 += tmp6;

								            tmp5 = clp[tmp5>>5];

								            *mb++ = (u8)tmp5;

								            ptrC++;

								        }

								        ptrC += 4*width - partWidth;

								        ptrV += 4*width - partWidth;

								        mb += 4*16 - partWidth;

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateVerQuarter


								        Functional description:

								          Function to perform vertical interpolation of pixel position 'd'

								          or 'n' for a block. Overfilling is done only if needed. Reference

								          image (ref) is read at correct position and the predicted part

								          is written to macroblock array (mb)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateVerQuarter(

								  u8 *ref,

								  u8 *mb,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 partWidth,

								  u32 partHeight,

								  u32 verOffset)    /* 0 for pixel d, 1 for pixel n */

								{

								    u32 p1[21*21/4+1];

								    u32 i, j;

								    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

								    u8 *ptrC, *ptrV, *ptrInt;

								    const u8 *clp = h264bsdClip + 512;


								    /* Code */


								    ASSERT(ref);

								    ASSERT(mb);


								    if ((x0 < 0) || ((u32)x0+partWidth > width) ||

								        (y0 < 0) || ((u32)y0+partHeight+5 > height))

								    {

								        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,

								                partWidth, partHeight+5, partWidth);


								        x0 = 0;

								        y0 = 0;

								        ref = (u8*)p1;

								        width = partWidth;

								    }


								    ref += (u32)y0 * width + (u32)x0;


								    ptrC = ref + width;

								    ptrV = ptrC + 5*width;


								    /* Pointer to integer sample position, either M or R */

								    ptrInt = ptrC + (2+verOffset)*width;


								    /* 4 pixels per iteration

								     * interpolate using 5 vertical samples and average between

								     * interpolated value and integer sample value */

								    for (i = (partHeight >> 2); i; i--)

								    {

								        /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */

								        for (j = partWidth; j; j--)

								        {

								            tmp4 = ptrV[-(i32)width*2];

								            tmp5 = ptrV[-(i32)width];

								            tmp1 = ptrV[width];

								            tmp2 = ptrV[width*2];

								            tmp6 = *ptrV++;


								            tmp7 = tmp4 + tmp1;

								            tmp2 -= (tmp7 << 2);

								            tmp2 -= tmp7;

								            tmp2 += 16;

								            tmp7 = tmp5 + tmp6;

								            tmp3 = ptrC[width*2];

								            tmp2 += (tmp7 << 4);

								            tmp2 += (tmp7 << 2);

								            tmp2 += tmp3;

								            tmp2 = clp[tmp2>>5];

								            tmp7 = ptrInt[width*2];

								            tmp1 += 16;

								            tmp2++;

								            mb[48] = (u8)((tmp2 + tmp7) >> 1);


								            tmp7 = tmp3 + tmp6;

								            tmp1 -= (tmp7 << 2);

								            tmp1 -= tmp7;

								            tmp7 = tmp4 + tmp5;

								            tmp2 = ptrC[width];

								            tmp1 += (tmp7 << 4);

								            tmp1 += (tmp7 << 2);

								            tmp1 += tmp2;

								            tmp1 = clp[tmp1>>5];

								            tmp7 = ptrInt[width];

								            tmp6 += 16;

								            tmp1++;

								            mb[32] = (u8)((tmp1 + tmp7) >> 1);


								            tmp7 = tmp2 + tmp5;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp7 = tmp4 + tmp3;

								            tmp1 = *ptrC;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp6 += tmp1;

								            tmp6 = clp[tmp6>>5];

								            tmp7 = *ptrInt;

								            tmp5 += 16;

								            tmp6++;

								            mb[16] = (u8)((tmp6 + tmp7) >> 1);


								            tmp1 += tmp4;

								            tmp5 -= (tmp1 << 2);

								            tmp5 -= tmp1;

								            tmp3 += tmp2;

								            tmp6 = ptrC[-(i32)width];

								            tmp5 += (tmp3 << 4);

								            tmp5 += (tmp3 << 2);

								            tmp5 += tmp6;

								            tmp5 = clp[tmp5>>5];

								            tmp7 = ptrInt[-(i32)width];

								            tmp5++;

								            *mb++ = (u8)((tmp5 + tmp7) >> 1);

								            ptrC++;

								            ptrInt++;

								        }

								        ptrC += 4*width - partWidth;

								        ptrV += 4*width - partWidth;

								        ptrInt += 4*width - partWidth;

								        mb += 4*16 - partWidth;

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateHorHalf


								        Functional description:

								          Function to perform horizontal interpolation of pixel position 'b'

								          for a block. Overfilling is done only if needed. Reference

								          image (ref) is read at correct position and the predicted part

								          is written to macroblock array (mb)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateHorHalf(

								  u8 *ref,

								  u8 *mb,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 partWidth,

								  u32 partHeight)

								{

								    u32 p1[21*21/4+1];

								    u8 *ptrJ;

								    u32 x, y;

								    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

								    const u8 *clp = h264bsdClip + 512;


								    /* Code */


								    ASSERT(ref);

								    ASSERT(mb);

								    ASSERT((partWidth&0x3) == 0);

								    ASSERT((partHeight&0x3) == 0);


								    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||

								        (y0 < 0) || ((u32)y0+partHeight > height))

								    {

								        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,

								                partWidth+5, partHeight, partWidth+5);


								        x0 = 0;

								        y0 = 0;

								        ref = (u8*)p1;

								        width = partWidth + 5;

								    }


								    ref += (u32)y0 * width + (u32)x0;


								    ptrJ = ref + 5;


								    for (y = partHeight; y; y--)

								    {

								        tmp6 = *(ptrJ - 5);

								        tmp5 = *(ptrJ - 4);

								        tmp4 = *(ptrJ - 3);

								        tmp3 = *(ptrJ - 2);

								        tmp2 = *(ptrJ - 1);


								        /* calculate 4 pels per iteration */

								        for (x = (partWidth >> 2); x; x--)

								        {

								            /* First pixel */

								            tmp6 += 16;

								            tmp7 = tmp3 + tmp4;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp7 = tmp2 + tmp5;

								            tmp1 = *ptrJ++;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp6 += tmp1;

								            tmp6 = clp[tmp6>>5];

								            /* Second pixel */

								            tmp5 += 16;

								            tmp7 = tmp2 + tmp3;

								            *mb++ = (u8)tmp6;

								            tmp5 += (tmp7 << 4);

								            tmp5 += (tmp7 << 2);

								            tmp7 = tmp1 + tmp4;

								            tmp6 = *ptrJ++;

								            tmp5 -= (tmp7 << 2);

								            tmp5 -= tmp7;

								            tmp5 += tmp6;

								            tmp5 = clp[tmp5>>5];

								            /* Third pixel */

								            tmp4 += 16;

								            tmp7 = tmp1 + tmp2;

								            *mb++ = (u8)tmp5;

								            tmp4 += (tmp7 << 4);

								            tmp4 += (tmp7 << 2);

								            tmp7 = tmp6 + tmp3;

								            tmp5 = *ptrJ++;

								            tmp4 -= (tmp7 << 2);

								            tmp4 -= tmp7;

								            tmp4 += tmp5;

								            tmp4 = clp[tmp4>>5];

								            /* Fourth pixel */

								            tmp3 += 16;

								            tmp7 = tmp6 + tmp1;

								            *mb++ = (u8)tmp4;

								            tmp3 += (tmp7 << 4);

								            tmp3 += (tmp7 << 2);

								            tmp7 = tmp5 + tmp2;

								            tmp4 = *ptrJ++;

								            tmp3 -= (tmp7 << 2);

								            tmp3 -= tmp7;

								            tmp3 += tmp4;

								            tmp3 = clp[tmp3>>5];

								            tmp7 = tmp4;

								            tmp4 = tmp6;

								            tmp6 = tmp2;

								            tmp2 = tmp7;

								            *mb++ = (u8)tmp3;

								            tmp3 = tmp5;

								            tmp5 = tmp1;

								        }

								        ptrJ += width - partWidth;

								        mb += 16 - partWidth;

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateHorQuarter


								        Functional description:

								          Function to perform horizontal interpolation of pixel position 'a'

								          or 'c' for a block. Overfilling is done only if needed. Reference

								          image (ref) is read at correct position and the predicted part

								          is written to macroblock array (mb)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateHorQuarter(

								  u8 *ref,

								  u8 *mb,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 partWidth,

								  u32 partHeight,

								  u32 horOffset) /* 0 for pixel a, 1 for pixel c */

								{

								    u32 p1[21*21/4+1];

								    u8 *ptrJ;

								    u32 x, y;

								    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

								    const u8 *clp = h264bsdClip + 512;


								    /* Code */


								    ASSERT(ref);

								    ASSERT(mb);


								    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||

								        (y0 < 0) || ((u32)y0+partHeight > height))

								    {

								        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,

								                partWidth+5, partHeight, partWidth+5);


								        x0 = 0;

								        y0 = 0;

								        ref = (u8*)p1;

								        width = partWidth + 5;

								    }


								    ref += (u32)y0 * width + (u32)x0;


								    ptrJ = ref + 5;


								    for (y = partHeight; y; y--)

								    {

								        tmp6 = *(ptrJ - 5);

								        tmp5 = *(ptrJ - 4);

								        tmp4 = *(ptrJ - 3);

								        tmp3 = *(ptrJ - 2);

								        tmp2 = *(ptrJ - 1);


								        /* calculate 4 pels per iteration */

								        for (x = (partWidth >> 2); x; x--)

								        {

								            /* First pixel */

								            tmp6 += 16;

								            tmp7 = tmp3 + tmp4;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp7 = tmp2 + tmp5;

								            tmp1 = *ptrJ++;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp6 += tmp1;

								            tmp6 = clp[tmp6>>5];

								            tmp5 += 16;

								            if (!horOffset)

								                tmp6 += tmp4;

								            else

								                tmp6 += tmp3;

								            *mb++ = (u8)((tmp6 + 1) >> 1);

								            /* Second pixel */

								            tmp7 = tmp2 + tmp3;

								            tmp5 += (tmp7 << 4);

								            tmp5 += (tmp7 << 2);

								            tmp7 = tmp1 + tmp4;

								            tmp6 = *ptrJ++;

								            tmp5 -= (tmp7 << 2);

								            tmp5 -= tmp7;

								            tmp5 += tmp6;

								            tmp5 = clp[tmp5>>5];

								            tmp4 += 16;

								            if (!horOffset)

								                tmp5 += tmp3;

								            else

								                tmp5 += tmp2;

								            *mb++ = (u8)((tmp5 + 1) >> 1);

								            /* Third pixel */

								            tmp7 = tmp1 + tmp2;

								            tmp4 += (tmp7 << 4);

								            tmp4 += (tmp7 << 2);

								            tmp7 = tmp6 + tmp3;

								            tmp5 = *ptrJ++;

								            tmp4 -= (tmp7 << 2);

								            tmp4 -= tmp7;

								            tmp4 += tmp5;

								            tmp4 = clp[tmp4>>5];

								            tmp3 += 16;

								            if (!horOffset)

								                tmp4 += tmp2;

								            else

								                tmp4 += tmp1;

								            *mb++ = (u8)((tmp4 + 1) >> 1);

								            /* Fourth pixel */

								            tmp7 = tmp6 + tmp1;

								            tmp3 += (tmp7 << 4);

								            tmp3 += (tmp7 << 2);

								            tmp7 = tmp5 + tmp2;

								            tmp4 = *ptrJ++;

								            tmp3 -= (tmp7 << 2);

								            tmp3 -= tmp7;

								            tmp3 += tmp4;

								            tmp3 = clp[tmp3>>5];

								            if (!horOffset)

								                tmp3 += tmp1;

								            else

								                tmp3 += tmp6;

								            *mb++ = (u8)((tmp3 + 1) >> 1);

								            tmp3 = tmp5;

								            tmp5 = tmp1;

								            tmp7 = tmp4;

								            tmp4 = tmp6;

								            tmp6 = tmp2;

								            tmp2 = tmp7;

								        }

								        ptrJ += width - partWidth;

								        mb += 16 - partWidth;

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateHorVerQuarter


								        Functional description:

								          Function to perform horizontal and vertical interpolation of pixel

								          position 'e', 'g', 'p' or 'r' for a block. Overfilling is done only

								          if needed. Reference image (ref) is read at correct position and

								          the predicted part is written to macroblock array (mb)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateHorVerQuarter(

								  u8 *ref,

								  u8 *mb,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 partWidth,

								  u32 partHeight,

								  u32 horVerOffset) /* 0 for pixel e, 1 for pixel g,

								                       2 for pixel p, 3 for pixel r */

								{

								    u32 p1[21*21/4+1];

								    u8 *ptrC, *ptrJ, *ptrV;

								    u32 x, y;

								    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

								    const u8 *clp = h264bsdClip + 512;


								    /* Code */


								    ASSERT(ref);

								    ASSERT(mb);


								    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||

								        (y0 < 0) || ((u32)y0+partHeight+5 > height))

								    {

								        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,

								                partWidth+5, partHeight+5, partWidth+5);


								        x0 = 0;

								        y0 = 0;

								        ref = (u8*)p1;

								        width = partWidth+5;

								    }


								    /* Ref points to G + (-2, -2) */

								    ref += (u32)y0 * width + (u32)x0;


								    /* ptrJ points to either J or Q, depending on vertical offset */

								    ptrJ = ref + (((horVerOffset & 0x2) >> 1) + 2) * width + 5;


								    /* ptrC points to either C or D, depending on horizontal offset */

								    ptrC = ref + width + 2 + (horVerOffset & 0x1);


								    for (y = partHeight; y; y--)

								    {

								        tmp6 = *(ptrJ - 5);

								        tmp5 = *(ptrJ - 4);

								        tmp4 = *(ptrJ - 3);

								        tmp3 = *(ptrJ - 2);

								        tmp2 = *(ptrJ - 1);


								        /* Horizontal interpolation, calculate 4 pels per iteration */

								        for (x = (partWidth >> 2); x; x--)

								        {

								            /* First pixel */

								            tmp6 += 16;

								            tmp7 = tmp3 + tmp4;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp7 = tmp2 + tmp5;

								            tmp1 = *ptrJ++;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp6 += tmp1;

								            tmp6 = clp[tmp6>>5];

								            /* Second pixel */

								            tmp5 += 16;

								            tmp7 = tmp2 + tmp3;

								            *mb++ = (u8)tmp6;

								            tmp5 += (tmp7 << 4);

								            tmp5 += (tmp7 << 2);

								            tmp7 = tmp1 + tmp4;

								            tmp6 = *ptrJ++;

								            tmp5 -= (tmp7 << 2);

								            tmp5 -= tmp7;

								            tmp5 += tmp6;

								            tmp5 = clp[tmp5>>5];

								            /* Third pixel */

								            tmp4 += 16;

								            tmp7 = tmp1 + tmp2;

								            *mb++ = (u8)tmp5;

								            tmp4 += (tmp7 << 4);

								            tmp4 += (tmp7 << 2);

								            tmp7 = tmp6 + tmp3;

								            tmp5 = *ptrJ++;

								            tmp4 -= (tmp7 << 2);

								            tmp4 -= tmp7;

								            tmp4 += tmp5;

								            tmp4 = clp[tmp4>>5];

								            /* Fourth pixel */

								            tmp3 += 16;

								            tmp7 = tmp6 + tmp1;

								            *mb++ = (u8)tmp4;

								            tmp3 += (tmp7 << 4);

								            tmp3 += (tmp7 << 2);

								            tmp7 = tmp5 + tmp2;

								            tmp4 = *ptrJ++;

								            tmp3 -= (tmp7 << 2);

								            tmp3 -= tmp7;

								            tmp3 += tmp4;

								            tmp3 = clp[tmp3>>5];

								            tmp7 = tmp4;

								            tmp4 = tmp6;

								            tmp6 = tmp2;

								            tmp2 = tmp7;

								            *mb++ = (u8)tmp3;

								            tmp3 = tmp5;

								            tmp5 = tmp1;

								        }

								        ptrJ += width - partWidth;

								        mb += 16 - partWidth;

								    }


								    mb -= 16*partHeight;

								    ptrV = ptrC + 5*width;


								    for (y = (partHeight >> 2); y; y--)

								    {

								        /* Vertical interpolation and averaging, 4 pels per iteration */

								        for (x = partWidth; x; x--)

								        {

								            tmp4 = ptrV[-(i32)width*2];

								            tmp5 = ptrV[-(i32)width];

								            tmp1 = ptrV[width];

								            tmp2 = ptrV[width*2];

								            tmp6 = *ptrV++;


								            tmp7 = tmp4 + tmp1;

								            tmp2 -= (tmp7 << 2);

								            tmp2 -= tmp7;

								            tmp2 += 16;

								            tmp7 = tmp5 + tmp6;

								            tmp3 = ptrC[width*2];

								            tmp2 += (tmp7 << 4);

								            tmp2 += (tmp7 << 2);

								            tmp2 += tmp3;

								            tmp7 = clp[tmp2>>5];

								            tmp2 = mb[48];

								            tmp1 += 16;

								            tmp7++;

								            mb[48] = (u8)((tmp2 + tmp7) >> 1);


								            tmp7 = tmp3 + tmp6;

								            tmp1 -= (tmp7 << 2);

								            tmp1 -= tmp7;

								            tmp7 = tmp4 + tmp5;

								            tmp2 = ptrC[width];

								            tmp1 += (tmp7 << 4);

								            tmp1 += (tmp7 << 2);

								            tmp1 += tmp2;

								            tmp7 = clp[tmp1>>5];

								            tmp1 = mb[32];

								            tmp6 += 16;

								            tmp7++;

								            mb[32] = (u8)((tmp1 + tmp7) >> 1);


								            tmp1 = *ptrC;

								            tmp7 = tmp2 + tmp5;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp7 = tmp4 + tmp3;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp6 += tmp1;

								            tmp7 = clp[tmp6>>5];

								            tmp6 = mb[16];

								            tmp5 += 16;

								            tmp7++;

								            mb[16] = (u8)((tmp6 + tmp7) >> 1);


								            tmp6 = ptrC[-(i32)width];

								            tmp1 += tmp4;

								            tmp5 -= (tmp1 << 2);

								            tmp5 -= tmp1;

								            tmp3 += tmp2;

								            tmp5 += (tmp3 << 4);

								            tmp5 += (tmp3 << 2);

								            tmp5 += tmp6;

								            tmp7 = clp[tmp5>>5];

								            tmp5 = *mb;

								            tmp7++;

								            *mb++ = (u8)((tmp5 + tmp7) >> 1);

								            ptrC++;


								        }

								        ptrC += 4*width - partWidth;

								        ptrV += 4*width - partWidth;

								        mb += 4*16 - partWidth;

								    }


								}

								#endif


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateMidHalf


								        Functional description:

								          Function to perform horizontal and vertical interpolation of pixel

								          position 'j' for a block. Overfilling is done only if needed.

								          Reference image (ref) is read at correct position and the predicted

								          part is written to macroblock array (mb)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateMidHalf(

								  u8 *ref,

								  u8 *mb,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 partWidth,

								  u32 partHeight)

								{

								    u32 p1[21*21/4+1];

								    u32 x, y;

								    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

								    i32 *ptrC, *ptrV, *b1;

								    u8  *ptrJ;

								    i32 table[21*16];

								    const u8 *clp = h264bsdClip + 512;


								    /* Code */


								    ASSERT(ref);

								    ASSERT(mb);


								    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||

								        (y0 < 0) || ((u32)y0+partHeight+5 > height))

								    {

								        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,

								                partWidth+5, partHeight+5, partWidth+5);


								        x0 = 0;

								        y0 = 0;

								        ref = (u8*)p1;

								        width = partWidth+5;

								    }


								    ref += (u32)y0 * width + (u32)x0;


								    b1 = table;

								    ptrJ = ref + 5;


								    /* First step: calculate intermediate values for

								     * horizontal interpolation */

								    for (y = partHeight + 5; y; y--)

								    {

								        tmp6 = *(ptrJ - 5);

								        tmp5 = *(ptrJ - 4);

								        tmp4 = *(ptrJ - 3);

								        tmp3 = *(ptrJ - 2);

								        tmp2 = *(ptrJ - 1);


								        /* 4 pels per iteration */

								        for (x = (partWidth >> 2); x; x--)

								        {

								            /* First pixel */

								            tmp7 = tmp3 + tmp4;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp7 = tmp2 + tmp5;

								            tmp1 = *ptrJ++;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp6 += tmp1;

								            *b1++ = tmp6;

								            /* Second pixel */

								            tmp7 = tmp2 + tmp3;

								            tmp5 += (tmp7 << 4);

								            tmp5 += (tmp7 << 2);

								            tmp7 = tmp1 + tmp4;

								            tmp6 = *ptrJ++;

								            tmp5 -= (tmp7 << 2);

								            tmp5 -= tmp7;

								            tmp5 += tmp6;

								            *b1++ = tmp5;

								            /* Third pixel */

								            tmp7 = tmp1 + tmp2;

								            tmp4 += (tmp7 << 4);

								            tmp4 += (tmp7 << 2);

								            tmp7 = tmp6 + tmp3;

								            tmp5 = *ptrJ++;

								            tmp4 -= (tmp7 << 2);

								            tmp4 -= tmp7;

								            tmp4 += tmp5;

								            *b1++ = tmp4;

								            /* Fourth pixel */

								            tmp7 = tmp6 + tmp1;

								            tmp3 += (tmp7 << 4);

								            tmp3 += (tmp7 << 2);

								            tmp7 = tmp5 + tmp2;

								            tmp4 = *ptrJ++;

								            tmp3 -= (tmp7 << 2);

								            tmp3 -= tmp7;

								            tmp3 += tmp4;

								            *b1++ = tmp3;

								            tmp7 = tmp4;

								            tmp4 = tmp6;

								            tmp6 = tmp2;

								            tmp2 = tmp7;

								            tmp3 = tmp5;

								            tmp5 = tmp1;

								        }

								        ptrJ += width - partWidth;

								    }


								    /* Second step: calculate vertical interpolation */

								    ptrC = table + partWidth;

								    ptrV = ptrC + 5*partWidth;

								    for (y = (partHeight >> 2); y; y--)

								    {

								        /* 4 pels per iteration */

								        for (x = partWidth; x; x--)

								        {

								            tmp4 = ptrV[-(i32)partWidth*2];

								            tmp5 = ptrV[-(i32)partWidth];

								            tmp1 = ptrV[partWidth];

								            tmp2 = ptrV[partWidth*2];

								            tmp6 = *ptrV++;


								            tmp7 = tmp4 + tmp1;

								            tmp2 -= (tmp7 << 2);

								            tmp2 -= tmp7;

								            tmp2 += 512;

								            tmp7 = tmp5 + tmp6;

								            tmp3 = ptrC[partWidth*2];

								            tmp2 += (tmp7 << 4);

								            tmp2 += (tmp7 << 2);

								            tmp2 += tmp3;

								            tmp7 = clp[tmp2>>10];

								            tmp1 += 512;

								            mb[48] = (u8)tmp7;


								            tmp7 = tmp3 + tmp6;

								            tmp1 -= (tmp7 << 2);

								            tmp1 -= tmp7;

								            tmp7 = tmp4 + tmp5;

								            tmp2 = ptrC[partWidth];

								            tmp1 += (tmp7 << 4);

								            tmp1 += (tmp7 << 2);

								            tmp1 += tmp2;

								            tmp7 = clp[tmp1>>10];

								            tmp6 += 512;

								            mb[32] = (u8)tmp7;


								            tmp1 = *ptrC;

								            tmp7 = tmp2 + tmp5;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp7 = tmp4 + tmp3;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp6 += tmp1;

								            tmp7 = clp[tmp6>>10];

								            tmp5 += 512;

								            mb[16] = (u8)tmp7;


								            tmp6 = ptrC[-(i32)partWidth];

								            tmp1 += tmp4;

								            tmp5 -= (tmp1 << 2);

								            tmp5 -= tmp1;

								            tmp3 += tmp2;

								            tmp5 += (tmp3 << 4);

								            tmp5 += (tmp3 << 2);

								            tmp5 += tmp6;

								            tmp7 = clp[tmp5>>10];

								            *mb++ = (u8)tmp7;

								            ptrC++;

								        }

								        mb += 4*16 - partWidth;

								        ptrC += 3*partWidth;

								        ptrV += 3*partWidth;

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateMidVerQuarter


								        Functional description:

								          Function to perform horizontal and vertical interpolation of pixel

								          position 'f' or 'q' for a block. Overfilling is done only if needed.

								          Reference image (ref) is read at correct position and the predicted

								          part is written to macroblock array (mb)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateMidVerQuarter(

								  u8 *ref,

								  u8 *mb,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 partWidth,

								  u32 partHeight,

								  u32 verOffset)    /* 0 for pixel f, 1 for pixel q */

								{

								    u32 p1[21*21/4+1];

								    u32 x, y;

								    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

								    i32 *ptrC, *ptrV, *ptrInt, *b1;

								    u8  *ptrJ;

								    i32 table[21*16];

								    const u8 *clp = h264bsdClip + 512;


								    /* Code */


								    ASSERT(ref);

								    ASSERT(mb);


								    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||

								        (y0 < 0) || ((u32)y0+partHeight+5 > height))

								    {

								        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,

								                partWidth+5, partHeight+5, partWidth+5);


								        x0 = 0;

								        y0 = 0;

								        ref = (u8*)p1;

								        width = partWidth+5;

								    }


								    ref += (u32)y0 * width + (u32)x0;


								    b1 = table;

								    ptrJ = ref + 5;


								    /* First step: calculate intermediate values for

								     * horizontal interpolation */

								    for (y = partHeight + 5; y; y--)

								    {

								        tmp6 = *(ptrJ - 5);

								        tmp5 = *(ptrJ - 4);

								        tmp4 = *(ptrJ - 3);

								        tmp3 = *(ptrJ - 2);

								        tmp2 = *(ptrJ - 1);

								        for (x = (partWidth >> 2); x; x--)

								        {

								            /* First pixel */

								            tmp7 = tmp3 + tmp4;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp7 = tmp2 + tmp5;

								            tmp1 = *ptrJ++;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp6 += tmp1;

								            *b1++ = tmp6;

								            /* Second pixel */

								            tmp7 = tmp2 + tmp3;

								            tmp5 += (tmp7 << 4);

								            tmp5 += (tmp7 << 2);

								            tmp7 = tmp1 + tmp4;

								            tmp6 = *ptrJ++;

								            tmp5 -= (tmp7 << 2);

								            tmp5 -= tmp7;

								            tmp5 += tmp6;

								            *b1++ = tmp5;

								            /* Third pixel */

								            tmp7 = tmp1 + tmp2;

								            tmp4 += (tmp7 << 4);

								            tmp4 += (tmp7 << 2);

								            tmp7 = tmp6 + tmp3;

								            tmp5 = *ptrJ++;

								            tmp4 -= (tmp7 << 2);

								            tmp4 -= tmp7;

								            tmp4 += tmp5;

								            *b1++ = tmp4;

								            /* Fourth pixel */

								            tmp7 = tmp6 + tmp1;

								            tmp3 += (tmp7 << 4);

								            tmp3 += (tmp7 << 2);

								            tmp7 = tmp5 + tmp2;

								            tmp4 = *ptrJ++;

								            tmp3 -= (tmp7 << 2);

								            tmp3 -= tmp7;

								            tmp3 += tmp4;

								            *b1++ = tmp3;

								            tmp7 = tmp4;

								            tmp4 = tmp6;

								            tmp6 = tmp2;

								            tmp2 = tmp7;

								            tmp3 = tmp5;

								            tmp5 = tmp1;

								        }

								        ptrJ += width - partWidth;

								    }


								    /* Second step: calculate vertical interpolation and average */

								    ptrC = table + partWidth;

								    ptrV = ptrC + 5*partWidth;

								    /* Pointer to integer sample position, either M or R */

								    ptrInt = ptrC + (2+verOffset)*partWidth;

								    for (y = (partHeight >> 2); y; y--)

								    {

								        for (x = partWidth; x; x--)

								        {

								            tmp4 = ptrV[-(i32)partWidth*2];

								            tmp5 = ptrV[-(i32)partWidth];

								            tmp1 = ptrV[partWidth];

								            tmp2 = ptrV[partWidth*2];

								            tmp6 = *ptrV++;


								            tmp7 = tmp4 + tmp1;

								            tmp2 -= (tmp7 << 2);

								            tmp2 -= tmp7;

								            tmp2 += 512;

								            tmp7 = tmp5 + tmp6;

								            tmp3 = ptrC[partWidth*2];

								            tmp2 += (tmp7 << 4);

								            tmp2 += (tmp7 << 2);

								            tmp7 = ptrInt[partWidth*2];

								            tmp2 += tmp3;

								            tmp2 = clp[tmp2>>10];

								            tmp7 += 16;

								            tmp7 = clp[tmp7>>5];

								            tmp1 += 512;

								            tmp2++;

								            mb[48] = (u8)((tmp7 + tmp2) >> 1);


								            tmp7 = tmp3 + tmp6;

								            tmp1 -= (tmp7 << 2);

								            tmp1 -= tmp7;

								            tmp7 = tmp4 + tmp5;

								            tmp2 = ptrC[partWidth];

								            tmp1 += (tmp7 << 4);

								            tmp1 += (tmp7 << 2);

								            tmp7 = ptrInt[partWidth];

								            tmp1 += tmp2;

								            tmp1 = clp[tmp1>>10];

								            tmp7 += 16;

								            tmp7 = clp[tmp7>>5];

								            tmp6 += 512;

								            tmp1++;

								            mb[32] = (u8)((tmp7 + tmp1) >> 1);


								            tmp1 = *ptrC;

								            tmp7 = tmp2 + tmp5;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp7 = tmp4 + tmp3;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp7 = *ptrInt;

								            tmp6 += tmp1;

								            tmp6 = clp[tmp6>>10];

								            tmp7 += 16;

								            tmp7 = clp[tmp7>>5];

								            tmp5 += 512;

								            tmp6++;

								            mb[16] = (u8)((tmp7 + tmp6) >> 1);


								            tmp6 = ptrC[-(i32)partWidth];

								            tmp1 += tmp4;

								            tmp5 -= (tmp1 << 2);

								            tmp5 -= tmp1;

								            tmp3 += tmp2;

								            tmp5 += (tmp3 << 4);

								            tmp5 += (tmp3 << 2);

								            tmp7 = ptrInt[-(i32)partWidth];

								            tmp5 += tmp6;

								            tmp5 = clp[tmp5>>10];

								            tmp7 += 16;

								            tmp7 = clp[tmp7>>5];

								            tmp5++;

								            *mb++ = (u8)((tmp7 + tmp5) >> 1);

								            ptrC++;

								            ptrInt++;

								        }

								        mb += 4*16 - partWidth;

								        ptrC += 3*partWidth;

								        ptrV += 3*partWidth;

								        ptrInt += 3*partWidth;

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdInterpolateMidHorQuarter


								        Functional description:

								          Function to perform horizontal and vertical interpolation of pixel

								          position 'i' or 'k' for a block. Overfilling is done only if needed.

								          Reference image (ref) is read at correct position and the predicted

								          part is written to macroblock array (mb)


								------------------------------------------------------------------------------*/


								void h264bsdInterpolateMidHorQuarter(

								  u8 *ref,

								  u8 *mb,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 partWidth,

								  u32 partHeight,

								  u32 horOffset)    /* 0 for pixel i, 1 for pixel k */

								{

								    u32 p1[21*21/4+1];

								    u32 x, y;

								    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

								    i32 *ptrJ, *ptrInt, *h1;

								    u8  *ptrC, *ptrV;

								    i32 table[21*16];

								    i32 tableWidth = (i32)partWidth+5;

								    const u8 *clp = h264bsdClip + 512;


								    /* Code */


								    ASSERT(ref);

								    ASSERT(mb);


								    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||

								        (y0 < 0) || ((u32)y0+partHeight+5 > height))

								    {

								        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,

								                partWidth+5, partHeight+5, partWidth+5);


								        x0 = 0;

								        y0 = 0;

								        ref = (u8*)p1;

								        width = partWidth+5;

								    }


								    ref += (u32)y0 * width + (u32)x0;


								    h1 = table + tableWidth;

								    ptrC = ref + width;

								    ptrV = ptrC + 5*width;


								    /* First step: calculate intermediate values for

								     * vertical interpolation */

								    for (y = (partHeight >> 2); y; y--)

								    {

								        for (x = (u32)tableWidth; x; x--)

								        {

								            tmp4 = ptrV[-(i32)width*2];

								            tmp5 = ptrV[-(i32)width];

								            tmp1 = ptrV[width];

								            tmp2 = ptrV[width*2];

								            tmp6 = *ptrV++;


								            tmp7 = tmp4 + tmp1;

								            tmp2 -= (tmp7 << 2);

								            tmp2 -= tmp7;

								            tmp7 = tmp5 + tmp6;

								            tmp3 = ptrC[width*2];

								            tmp2 += (tmp7 << 4);

								            tmp2 += (tmp7 << 2);

								            tmp2 += tmp3;

								            h1[tableWidth*2] = tmp2;


								            tmp7 = tmp3 + tmp6;

								            tmp1 -= (tmp7 << 2);

								            tmp1 -= tmp7;

								            tmp7 = tmp4 + tmp5;

								            tmp2 = ptrC[width];

								            tmp1 += (tmp7 << 4);

								            tmp1 += (tmp7 << 2);

								            tmp1 += tmp2;

								            h1[tableWidth] = tmp1;


								            tmp1 = *ptrC;

								            tmp7 = tmp2 + tmp5;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp7 = tmp4 + tmp3;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp6 += tmp1;

								            *h1 = tmp6;


								            tmp6 = ptrC[-(i32)width];

								            tmp1 += tmp4;

								            tmp5 -= (tmp1 << 2);

								            tmp5 -= tmp1;

								            tmp3 += tmp2;

								            tmp5 += (tmp3 << 4);

								            tmp5 += (tmp3 << 2);

								            tmp5 += tmp6;

								            h1[-tableWidth] = tmp5;

								            h1++;

								            ptrC++;

								        }

								        ptrC += 4*width - partWidth - 5;

								        ptrV += 4*width - partWidth - 5;

								        h1 += 3*tableWidth;

								    }


								    /* Second step: calculate horizontal interpolation and average */

								    ptrJ = table + 5;

								    /* Pointer to integer sample position, either G or H */

								    ptrInt = table + 2 + horOffset;

								    for (y = partHeight; y; y--)

								    {

								        tmp6 = *(ptrJ - 5);

								        tmp5 = *(ptrJ - 4);

								        tmp4 = *(ptrJ - 3);

								        tmp3 = *(ptrJ - 2);

								        tmp2 = *(ptrJ - 1);

								        for (x = (partWidth>>2); x; x--)

								        {

								            /* First pixel */

								            tmp6 += 512;

								            tmp7 = tmp3 + tmp4;

								            tmp6 += (tmp7 << 4);

								            tmp6 += (tmp7 << 2);

								            tmp7 = tmp2 + tmp5;

								            tmp1 = *ptrJ++;

								            tmp6 -= (tmp7 << 2);

								            tmp6 -= tmp7;

								            tmp7 = *ptrInt++;

								            tmp6 += tmp1;

								            tmp6 = clp[tmp6 >> 10];

								            tmp7 += 16;

								            tmp7 = clp[tmp7 >> 5];

								            tmp5 += 512;

								            tmp6++;

								            *mb++ = (u8)((tmp6 + tmp7) >> 1);

								            /* Second pixel */

								            tmp7 = tmp2 + tmp3;

								            tmp5 += (tmp7 << 4);

								            tmp5 += (tmp7 << 2);

								            tmp7 = tmp1 + tmp4;

								            tmp6 = *ptrJ++;

								            tmp5 -= (tmp7 << 2);

								            tmp5 -= tmp7;

								            tmp7 = *ptrInt++;

								            tmp5 += tmp6;

								            tmp5 = clp[tmp5 >> 10];

								            tmp7 += 16;

								            tmp7 = clp[tmp7 >> 5];

								            tmp4 += 512;

								            tmp5++;

								            *mb++ = (u8)((tmp5 + tmp7) >> 1);

								            /* Third pixel */

								            tmp7 = tmp1 + tmp2;

								            tmp4 += (tmp7 << 4);

								            tmp4 += (tmp7 << 2);

								            tmp7 = tmp6 + tmp3;

								            tmp5 = *ptrJ++;

								            tmp4 -= (tmp7 << 2);

								            tmp4 -= tmp7;

								            tmp7 = *ptrInt++;

								            tmp4 += tmp5;

								            tmp4 = clp[tmp4 >> 10];

								            tmp7 += 16;

								            tmp7 = clp[tmp7 >> 5];

								            tmp3 += 512;

								            tmp4++;

								            *mb++ = (u8)((tmp4 + tmp7) >> 1);

								            /* Fourth pixel */

								            tmp7 = tmp6 + tmp1;

								            tmp3 += (tmp7 << 4);

								            tmp3 += (tmp7 << 2);

								            tmp7 = tmp5 + tmp2;

								            tmp4 = *ptrJ++;

								            tmp3 -= (tmp7 << 2);

								            tmp3 -= tmp7;

								            tmp7 = *ptrInt++;

								            tmp3 += tmp4;

								            tmp3 = clp[tmp3 >> 10];

								            tmp7 += 16;

								            tmp7 = clp[tmp7 >> 5];

								            tmp3++;

								            *mb++ = (u8)((tmp3 + tmp7) >> 1);

								            tmp3 = tmp5;

								            tmp5 = tmp1;

								            tmp7 = tmp4;

								            tmp4 = tmp6;

								            tmp6 = tmp2;

								            tmp2 = tmp7;

								        }

								        ptrJ += 5;

								        ptrInt += 5;

								        mb += 16 - partWidth;

								    }


								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdPredictSamples


								        Functional description:

								          This function reconstructs a prediction for a macroblock partition.

								          The prediction is either copied or interpolated using the reference

								          frame and the motion vector. Both luminance and chrominance parts are

								          predicted. The prediction is stored in given macroblock array (data).

								        Inputs:

								          data          pointer to macroblock array (384 bytes) for output

								          mv            pointer to motion vector used for prediction

								          refPic        pointer to reference picture structure

								          xA            x-coordinate for current macroblock

								          yA            y-coordinate for current macroblock

								          partX         x-offset for partition in macroblock

								          partY         y-offset for partition in macroblock

								          partWidth     width of partition

								          partHeight    height of partition

								        Outputs:

								          data          macroblock array (16x16+8x8+8x8) where predicted

								                        partition is stored at correct position


								------------------------------------------------------------------------------*/


								void h264bsdPredictSamples(

								  u8 *data,

								  mv_t *mv,

								  image_t *refPic,

								  u32 xA,

								  u32 yA,

								  u32 partX,

								  u32 partY,

								  u32 partWidth,

								  u32 partHeight)


								{


								/* Variables */


								    u32 xFrac, yFrac, width, height;

								    i32 xInt, yInt;

								    u8 *lumaPartData;


								/* Code */


								    ASSERT(data);

								    ASSERT(mv);

								    ASSERT(partWidth);

								    ASSERT(partHeight);

								    ASSERT(refPic);

								    ASSERT(refPic->data);

								    ASSERT(refPic->width);

								    ASSERT(refPic->height);


								    /* luma */

								    lumaPartData = data + 16*partY + partX;


								    xFrac = mv->hor & 0x3;

								    yFrac = mv->ver & 0x3;


								    width = 16 * refPic->width;

								    height = 16 * refPic->height;


								    xInt = (i32)xA + (i32)partX + (mv->hor >> 2);

								    yInt = (i32)yA + (i32)partY + (mv->ver >> 2);


								    ASSERT(lumaFracPos[xFrac][yFrac] < 16);


								    switch (lumaFracPos[xFrac][yFrac])

								    {

								        case 0: /* G */

								            h264bsdFillBlock(refPic->data, lumaPartData,

								                    xInt,yInt,width,height,partWidth,partHeight,16);

								            break;

								        case 1: /* d */

								            h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,

								                    xInt, yInt-2, width, height, partWidth, partHeight, 0);

								            break;

								        case 2: /* h */

								            h264bsdInterpolateVerHalf(refPic->data, lumaPartData,

								                    xInt, yInt-2, width, height, partWidth, partHeight);

								            break;

								        case 3: /* n */

								            h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,

								                    xInt, yInt-2, width, height, partWidth, partHeight, 1);

								            break;

								        case 4: /* a */

								            h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt, width, height, partWidth, partHeight, 0);

								            break;

								        case 5: /* e */

								            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);

								            break;

								        case 6: /* i */

								            h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);

								            break;

								        case 7: /* p */

								            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight, 2);

								            break;

								        case 8: /* b */

								            h264bsdInterpolateHorHalf(refPic->data, lumaPartData,

								                    xInt-2, yInt, width, height, partWidth, partHeight);

								            break;

								        case 9: /* f */

								            h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);

								            break;

								        case 10: /* j */

								            h264bsdInterpolateMidHalf(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight);

								            break;

								        case 11: /* q */

								            h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);

								            break;

								        case 12: /* c */

								            h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt, width, height, partWidth, partHeight, 1);

								            break;

								        case 13: /* g */

								            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);

								            break;

								        case 14: /* k */

								            h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);

								            break;

								        default: /* case 15, r */

								            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,

								                    xInt-2, yInt-2, width, height, partWidth, partHeight, 3);

								            break;

								    }


								    /* chroma */

								    PredictChroma(

								      data + 16*16 + (partY>>1)*8 + (partX>>1),

								      xA + partX,

								      yA + partY,

								      partWidth,

								      partHeight,

								      mv,

								      refPic);


								}


								#else /* H264DEC_OMXDL */

								/*------------------------------------------------------------------------------


								    Function: h264bsdPredictSamples


								        Functional description:

								          This function reconstructs a prediction for a macroblock partition.

								          The prediction is either copied or interpolated using the reference

								          frame and the motion vector. Both luminance and chrominance parts are

								          predicted. The prediction is stored in given macroblock array (data).

								        Inputs:

								          data          pointer to macroblock array (384 bytes) for output

								          mv            pointer to motion vector used for prediction

								          refPic        pointer to reference picture structure

								          xA            x-coordinate for current macroblock

								          yA            y-coordinate for current macroblock

								          partX         x-offset for partition in macroblock

								          partY         y-offset for partition in macroblock

								          partWidth     width of partition

								          partHeight    height of partition

								        Outputs:

								          data          macroblock array (16x16+8x8+8x8) where predicted

								                        partition is stored at correct position


								------------------------------------------------------------------------------*/


								/*lint -e{550} Symbol 'res' not accessed */

								void h264bsdPredictSamples(

								  u8 *data,

								  mv_t *mv,

								  image_t *refPic,

								  u32 colAndRow,

								  u32 part,

								  u8 *pFill)


								{


								/* Variables */


								    u32 xFrac, yFrac;

								    u32 width, height;

								    i32 xInt, yInt, x0, y0;

								    u8 *partData, *ref;

								    OMXSize roi;

								    u32 fillWidth;

								    u32 fillHeight;

								    OMXResult res;

								    u32 xA, yA;

								    u32 partX, partY;

								    u32 partWidth, partHeight;


								/* Code */


								    ASSERT(data);

								    ASSERT(mv);

								    ASSERT(refPic);

								    ASSERT(refPic->data);

								    ASSERT(refPic->width);

								    ASSERT(refPic->height);


								    xA = (colAndRow & 0xFFFF0000) >> 16;

								    yA = (colAndRow & 0x0000FFFF);


								    partX = (part & 0xFF000000) >> 24;

								    partY = (part & 0x00FF0000) >> 16;

								    partWidth = (part & 0x0000FF00) >> 8;

								    partHeight = (part & 0x000000FF);


								    ASSERT(partWidth);

								    ASSERT(partHeight);


								    /* luma */

								    partData = data + 16*partY + partX;


								    xFrac = mv->hor & 0x3;

								    yFrac = mv->ver & 0x3;


								    width = 16 * refPic->width;

								    height = 16 * refPic->height;


								    xInt = (i32)xA + (i32)partX + (mv->hor >> 2);

								    yInt = (i32)yA + (i32)partY + (mv->ver >> 2);


								    x0 = (xFrac) ? xInt-2 : xInt;

								    y0 = (yFrac) ? yInt-2 : yInt;


								    if (xFrac)

								    {

								        if (partWidth == 16)

								            fillWidth = 32;

								        else

								            fillWidth = 16;

								    }

								    else

								        fillWidth = (partWidth*2);

								    if (yFrac)

								        fillHeight = partHeight+5;

								    else

								        fillHeight = partHeight;


								    if ((x0 < 0) || ((u32)x0+fillWidth > width) ||

								        (y0 < 0) || ((u32)y0+fillHeight > height))

								    {

								        h264bsdFillBlock(refPic->data, (u8*)pFill, x0, y0, width, height,

								                fillWidth, fillHeight, fillWidth);


								        x0 = 0;

								        y0 = 0;

								        ref = pFill;

								        width = fillWidth;

								        if (yFrac)

								            ref += 2*width;

								        if (xFrac)

								            ref += 2;

								    }

								    else

								    {

								        /*lint --e(737) Loss of sign */

								        ref = refPic->data + yInt*width + xInt;

								    }

								    /* Luma interpolation */

								    roi.width = (i32)partWidth;

								    roi.height = (i32)partHeight;


								    res = omxVCM4P10_InterpolateLuma(ref, (i32)width, partData, 16,

								                                        (i32)xFrac, (i32)yFrac, roi);

								    ASSERT(res == 0);


								    /* Chroma */

								    width  = 8 * refPic->width;

								    height = 8 * refPic->height;


								    x0 = ((xA + partX) >> 1) + (mv->hor >> 3);

								    y0 = ((yA + partY) >> 1) + (mv->ver >> 3);

								    xFrac = mv->hor & 0x7;

								    yFrac = mv->ver & 0x7;


								    ref = refPic->data + 256 * refPic->width * refPic->height;


								    roi.width = (i32)(partWidth >> 1);

								    fillWidth = ((partWidth >> 1) + 8) & ~0x7;

								    roi.height = (i32)(partHeight >> 1);

								    fillHeight = (partHeight >> 1) + 1;


								    if ((x0 < 0) || ((u32)x0+fillWidth > width) ||

								        (y0 < 0) || ((u32)y0+fillHeight > height))

								    {

								        h264bsdFillBlock(ref, pFill, x0, y0, width, height,

								            fillWidth, fillHeight, fillWidth);

								        ref += width * height;

								        h264bsdFillBlock(ref, pFill + fillWidth*fillHeight,

								            x0, y0, width, height, fillWidth,

								            fillHeight, fillWidth);


								        ref = pFill;

								        x0 = 0;

								        y0 = 0;

								        width = fillWidth;

								        height = fillHeight;

								    }


								    partData = data + 16*16 + (partY>>1)*8 + (partX>>1);


								    /* Chroma interpolation */

								    /*lint --e(737) Loss of sign */

								    ref += y0 * width + x0;

								    res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,

								                            (u32)roi.width, (u32)roi.height, xFrac, yFrac);

								    ASSERT(res == 0);

								    partData += 8 * 8;

								    ref += height * width;

								    res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,

								                            (u32)roi.width, (u32)roi.height, xFrac, yFrac);

								    ASSERT(res == 0);


								}


								#endif /* H264DEC_OMXDL */


								/*------------------------------------------------------------------------------


								    Function: FillRow1


								        Functional description:

								          This function gets a row of reference pels in a 'normal' case when no

								          overfilling is necessary.


								------------------------------------------------------------------------------*/


								static void FillRow1(

								  u8 *ref,

								  u8 *fill,

								  i32 left,

								  i32 center,

								  i32 right)

								{

								#ifndef FLASCC

								    ASSERT(ref);

								    ASSERT(fill);


								    memcpy(fill, ref, center);

								#else

								    int i = 0;

								    u8 *pdest = (u8*) fill;

								    u8 *psrc = (u8*) ref;

								    int loops = (center / sizeof(u32));


								    ASSERT(ref);

								    ASSERT(fill);


								    for(i = 0; i < loops; ++i)

								    {

								        *((u32*)pdest) = *((u32*)psrc);

								        pdest += sizeof(u32);

								        psrc += sizeof(u32);

								    }


								    loops = (center % sizeof(u32));

								    for (i = 0; i < loops; ++i)

								    {

								        *pdest = *psrc;

								        ++pdest;

								        ++psrc;

								    }

								#endif


								    /*lint -e(715) */

								}


								/*------------------------------------------------------------------------------


								    Function: h264bsdFillRow7


								        Functional description:

								          This function gets a row of reference pels when horizontal coordinate

								          is partly negative or partly greater than reference picture width

								          (overfilling some pels on left and/or right edge).

								        Inputs:

								          ref       pointer to reference samples

								          left      amount of pixels to overfill on left-edge

								          center    amount of pixels to copy

								          right     amount of pixels to overfill on right-edge

								        Outputs:

								          fill      pointer where samples are stored


								------------------------------------------------------------------------------*/

								#ifndef H264DEC_NEON

								void h264bsdFillRow7(

								  u8 *ref,

								  u8 *fill,

								  i32 left,

								  i32 center,

								  i32 right)

								{

								    u8 tmp = '\0';


								    ASSERT(ref);

								    ASSERT(fill);


								    if (left)

								        tmp = *ref;


								    for ( ; left; left--)

								        /*lint -esym(644,tmp)  tmp is initialized if used */

								        *fill++ = tmp;


								    for ( ; center; center--)

								        *fill++ = *ref++;


								    if (right)

								        tmp = ref[-1];


								    for ( ; right; right--)

								        /*lint -esym(644,tmp)  tmp is initialized if used */

								        *fill++ = tmp;

								}

								#endif

								/*------------------------------------------------------------------------------


								    Function: h264bsdFillBlock


								        Functional description:

								          This function gets a block of reference pels. It determines whether

								          overfilling is needed or not and repeatedly calls an appropriate

								          function (by using a function pointer) that fills one row the block.

								        Inputs:

								          ref               pointer to reference frame

								          x0                x-coordinate for block

								          y0                y-coordinate for block

								          width             width of reference frame

								          height            height of reference frame

								          blockWidth        width of block

								          blockHeight       height of block

								          fillScanLength    length of a line in output array (pixels)

								        Outputs:

								          fill              pointer to array where output block is written


								------------------------------------------------------------------------------*/


								void h264bsdFillBlock(

								  u8 *ref,

								  u8 *fill,

								  i32 x0,

								  i32 y0,

								  u32 width,

								  u32 height,

								  u32 blockWidth,

								  u32 blockHeight,

								  u32 fillScanLength)


								{


								/* Variables */


								    i32 xstop, ystop;

								    void (*fp)(u8*, u8*, i32, i32, i32);

								    i32 left, x, right;

								    i32 top, y, bottom;


								/* Code */


								    ASSERT(ref);

								    ASSERT(fill);

								    ASSERT(width);

								    ASSERT(height);

								    ASSERT(fill);

								    ASSERT(blockWidth);

								    ASSERT(blockHeight);


								    xstop = x0 + (i32)blockWidth;

								    ystop = y0 + (i32)blockHeight;


								    /* Choose correct function whether overfilling on left-edge or right-edge

								     * is needed or not */

								    if (x0 >= 0 && xstop <= (i32)width)

								        fp = FillRow1;

								    else

								        fp = h264bsdFillRow7;


								    if (ystop < 0)

								        y0 = -(i32)blockHeight;


								    if (xstop < 0)

								        x0 = -(i32)blockWidth;


								    if (y0 > (i32)height)

								        y0 = (i32)height;


								    if (x0 > (i32)width)

								        x0 = (i32)width;


								    xstop = x0 + (i32)blockWidth;

								    ystop = y0 + (i32)blockHeight;


								    if (x0 > 0)

								        ref += x0;


								    if (y0 > 0)

								        ref += y0 * (i32)width;


								    left = x0 < 0 ? -x0 : 0;

								    right = xstop > (i32)width ? xstop - (i32)width : 0;

								    x = (i32)blockWidth - left - right;


								    top = y0 < 0 ? -y0 : 0;

								    bottom = ystop > (i32)height ? ystop - (i32)height : 0;

								    y = (i32)blockHeight - top - bottom;


								    if (x0 >= 0 && xstop <= (i32)width)

								    {

								        for ( ; top; top-- )

								        {

								            FillRow1(ref, fill, left, x, right);

								            fill += fillScanLength;

								        }

								        for ( ; top; top-- )

								        {

								            FillRow1(ref, fill, left, x, right);

								        }

								        for ( ; y; y-- )

								        {

								            FillRow1(ref, fill, left, x, right);

								            ref += width;

								            fill += fillScanLength;

								        }

								    }

								    else

								    {

								        for ( ; top; top-- )

								        {

								            h264bsdFillRow7(ref, fill, left, x, right);

								            fill += fillScanLength;

								        }

								        for ( ; top; top-- )

								        {

								            h264bsdFillRow7(ref, fill, left, x, right);

								        }

								        for ( ; y; y-- )

								        {

								            h264bsdFillRow7(ref, fill, left, x, right);

								            ref += width;

								            fill += fillScanLength;

								        }

								    }

								    /* Top-overfilling */


								    /* Lines inside reference image */


								    ref -= width;


								    /* Bottom-overfilling */

								    for ( ; bottom; bottom-- )

								    {

								        //(*fp)(ref, fill, left, x, right);

								        if (x0 >= 0 && xstop <= (i32)width)

								            FillRow1(ref, fill, left, x, right);

								        else

								            h264bsdFillRow7(ref, fill, left, x, right);

								        fill += fillScanLength;

								    }

								}


								/*lint +e701 +e702 */