#include "headers.h"

/**
 * FUNCTION:
 *
 * void fdct_8x8(uint8* input_data, sint16* output_data, sint32 num_fdcts)
 * 
 * PARAMETERS:
 *
 * uint8* input_data   - uint8 pointer to macro block data for which the DCT
 *                       is performed
 *
 * sint16* output_data - sint16 pointer to output data of the DCT. Caller must
 *                       handle the memory allocation. The output_data must
 *                       *not* overlap the input_data!
 *
 * sint32 num_fdcts    - Indicates the number of 8x8 DCTs to be performed
 *                       i.e. this is the number of 8x8 components in the
 *                       input macroblock
 */


/* Original TI comment (partially outdated):
* ========================================================================= *
*                                                                           *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       fdct_8x8 -- 8x8 Block FDCT With Rounding, Endian Neutral            *
*                                                                           *
*   REVISION HISTORY                                                        *
*       20-May-1999 Initial handcode version                                *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C callable, and has the following C prototype:      *
*                                                                           *
*           void fdct_8x8(short fdct_data[], unsigned num_fdcts)            *
*                                                                           *
*       The fdct routine accepts a list of 8x8 pixel blocks and performs    *
*       FDCTs on each.  The array should be laid out identically to         *
*       "fdct_data[num_fdcts+1][8][8]".  All operations in this array are   *
*       performed entirely in-place.                                        *
*                                                                           *
*       Input values are stored in shorts, and may be in the range          *
*       [-512,511].  Larger input values may result in overflow,            *
*       although the 12-bit JPEG range [-1024,1023] overflows rarely.       *
*                                                                           *
*       This code requires '48 + 160 * num_fdcts' cycles to process         *
*       'num_fdcts' blocks, including function call overhead.  When         *
*       'num_fdcts' is zero, an early exit is taken and the function        *
*       runs for only 13 cycles (again, including call overhead).           *
*                                                                           *
*   DESCRIPTION                                                             *
*       The fdct_8x8 function implements a Chen FDCT.  Output values are    *
*       rounded, providing improved accuracy.  Input terms are expected     *
*       to be signed 11Q0 values, producing signed 15Q0 results.  (A        *
*       smaller dynamic range may be used on the input, producing a         *
*       correspondingly smaller output range.  Typical applications         *
*       include processing signed 9Q0 and unsigned 8Q0 pixel data,          *
*       producing signed 13Q0 or 12Q0 outputs, respectively.)  No           *
*       saturation is performed.                                            *
*/
       /**
        * 8-point 1-D forward DCT butterfly shared by both passes of fdct_8x8().
        *
        * f - the 8 input samples f[0]..f[7] (read only)
        * F - receives the 8 unscaled 32-bit results F[0]..F[7]
        *
        * F[0] and F[4] are plain sums/differences of the inputs (no multiply
        * is applied to them).  F[1..3] and F[5..7] are sums of products with
        * the Q13 coefficients below, so the caller must shift them down.
        *
        * The intermediate types (sint16 vs. sint32) deliberately match the
        * original hand-written code so that every narrowing happens at the
        * same point.  The ">> 16" on possibly negative values relies on the
        * compiler implementing an arithmetic right shift.
        */
       static void fdct_1d_chen8(const sint16 *f, sint32 *F)
       {
           /* ------------------------------------------------------------ */
           /*  Cosine coefficients.  c1..c7 are sqrt(2)*cos(k*pi/16) in    */
           /*  Q13; c0 is cos(pi/4) in Q16.                                */
           /* ------------------------------------------------------------ */
           const uint16 c1 = 0x2C62, c3 = 0x25A0;
           const uint16 c5 = 0x1924, c7 = 0x08D4;
           const uint16 c0 = 0xB505, c2 = 0x29CF;
           const uint16 c6 = 0x1151;

           sint32 g0, g1, h0, h1, p0, p1;      /* Even-half intermediate.  */
           sint16 r0, r1;                      /* Even-half intermediate.  */
           sint16 g2, g3, h2, h3;              /* Odd-half intermediate.   */
           sint16 q0a, s0a, q0, q1, s0, s1;    /* Odd-half intermediate.   */
           sint16 Q0, Q1, S0, S1;              /* Odd-half intermediate.   */

           /* ------------------------------------------------------------ */
           /*  Stage 1:  Separate into even and odd halves.                */
           /* ------------------------------------------------------------ */
           g0 = f[0] + f[7];               h2 = f[0] - f[7];
           g1 = f[1] + f[6];               h3 = f[1] - f[6];
           h1 = f[2] + f[5];               g3 = f[2] - f[5];
           h0 = f[3] + f[4];               g2 = f[3] - f[4];

           /* ------------------------------------------------------------ */
           /*  Stage 2                                                     */
           /* ------------------------------------------------------------ */
           p0 = g0 + h0;                   r0 = g0 - h0;
           p1 = g1 + h1;                   r1 = g1 - h1;
           q1 = g2;                        s1 = h2;

           s0a = h3 + g3;                  q0a = h3 - g3;
           /* Multiply by c0 (Q16) with rounding, back to Q0. */
           s0 = (s0a * c0 + 0x7FFF) >> 16;
           q0 = (q0a * c0 + 0x7FFF) >> 16;

           /* ------------------------------------------------------------ */
           /*  Stage 3 (odd half; the even half is folded into stage 4)    */
           /* ------------------------------------------------------------ */
           Q1 = q1 + q0;                   Q0 = q1 - q0;
           S1 = s1 + s0;                   S0 = s1 - s0;

           /* ------------------------------------------------------------ */
           /*  Stage 4:  Frequency-domain results.                         */
           /* ------------------------------------------------------------ */
           F[0] = p0 + p1;
           F[4] = p0 - p1;
           F[2] = c6 * r1 + c2 * r0;
           F[6] = c6 * r0 - c2 * r1;

           F[1] = c7 * Q1 + c1 * S1;
           F[7] = c7 * S1 - c1 * Q1;
           F[5] = c3 * Q0 + c5 * S0;
           F[3] = c3 * S0 - c5 * Q0;
       }

       /**
        * Forward 8x8 DCT on num_fdcts consecutive 8x8 blocks.
        *
        * input_data  - num_fdcts * 64 unsigned 8-bit samples, 8 per row;
        *               only read
        * output_data - num_fdcts * 64 sint16 coefficients, written by the
        *               caller-allocated buffer; must not overlap input_data
        * num_fdcts   - number of 8x8 blocks; nothing is touched when it is
        *               zero or negative
        *
        * Pass 1 transforms every column of each block from input_data into
        * output_data.  Pass 2 transforms every row of output_data in place
        * and applies the final rounding.
        */
       void fdct_8x8(uint8 *input_data, sint16 *output_data, sint32 num_fdcts)
       {
           sint16 f[8];                 /* Samples fed to the 1-D butterfly. */
           sint32 F[8];                 /* Unscaled butterfly results.       */
           sint32 i, j, k;
           const uint8 *dct_i_ptr = input_data;
           sint16 *dct_o_ptr = output_data;

           /* ------------------------------------------------------------ */
           /*  Pass 1: vertical 1-D FDCT on the columns of each block.     */
           /* ------------------------------------------------------------ */
           for (i = 0; i < num_fdcts; i++)
           {
               for (j = 0; j < 8; j++)
               {
                   /* Load column j: consecutive rows are 8 samples apart. */
                   for (k = 0; k < 8; k++)
                   {
                       f[k] = dct_i_ptr[8 * k];
                   }

                   #ifdef ENC_DEBUG
                   printf("vert. i: ");
                   for (k = 0; k < 8; k++)
                   {
                       /* %x requires an unsigned int argument. */
                       printf("%x ", (unsigned int)f[k]);
                   }
                   #endif

                   fdct_1d_chen8(f, F);

                   /* -------------------------------------------------- */
                   /*  Store the column.  The multiplied terms drop      */
                   /*  their 13 fractional bits by plain truncation (no  */
                   /*  rounding in this pass); F0 and F4 carry none.     */
                   /* -------------------------------------------------- */
                   dct_o_ptr[ 0] = (sint16)F[0];
                   dct_o_ptr[ 8] = (sint16)(F[1] >> 13);
                   dct_o_ptr[16] = (sint16)(F[2] >> 13);
                   dct_o_ptr[24] = (sint16)(F[3] >> 13);
                   dct_o_ptr[32] = (sint16)F[4];
                   dct_o_ptr[40] = (sint16)(F[5] >> 13);
                   dct_o_ptr[48] = (sint16)(F[6] >> 13);
                   dct_o_ptr[56] = (sint16)(F[7] >> 13);

                   #ifdef ENC_DEBUG
                   printf("o: ");
                   for (k = 0; k < 7; k++)
                   {
                       printf("%x ", (unsigned int)(uint16)dct_o_ptr[8 * k]);
                   }
                   printf("%x\r\n", (unsigned int)(uint16)dct_o_ptr[56]);
                   #endif

                   dct_i_ptr++;
                   dct_o_ptr++;
               }

               /* The 8 column steps advanced by 8; skip the remaining    */
               /* 56 entries to reach the next 8x8 block.                 */
               dct_i_ptr += 56;
               dct_o_ptr += 56;
           }

           /* ------------------------------------------------------------ */
           /*  Pass 2: horizontal 1-D FDCT on every row, in place.         */
           /* ------------------------------------------------------------ */
           dct_o_ptr = output_data;
           for (i = 0; i < 8 * num_fdcts; i++)
           {
               for (k = 0; k < 8; k++)
               {
                   f[k] = dct_o_ptr[k];
               }

               #ifdef ENC_DEBUG
               printf("horis. i: ");
               for (k = 0; k < 8; k++)
               {
                   /* Values may be negative here; %x needs unsigned int. */
                   printf("%x ", (unsigned int)f[k]);
               }
               #endif

               fdct_1d_chen8(f, F);

               /* ------------------------------------------------------ */
               /*  Round and truncate.                                   */
               /*                                                        */
               /*  F0 and F4 are shifted by 3 instead of 16 because no   */
               /*  multiply has been applied to either term.  F0 uses a  */
               /*  slightly larger rounding constant (6 instead of 4)    */
               /*  to offset the truncation of the first (vertical)      */
               /*  pass, which does not round.                           */
               /* ------------------------------------------------------ */
               dct_o_ptr[0] = (sint16)((F[0] + 0x0006) >>  3);
               dct_o_ptr[1] = (sint16)((F[1] + 0x7FFF) >> 16);
               dct_o_ptr[2] = (sint16)((F[2] + 0x7FFF) >> 16);
               dct_o_ptr[3] = (sint16)((F[3] + 0x7FFF) >> 16);
               dct_o_ptr[4] = (sint16)((F[4] + 0x0004) >>  3);
               dct_o_ptr[5] = (sint16)((F[5] + 0x7FFF) >> 16);
               dct_o_ptr[6] = (sint16)((F[6] + 0x7FFF) >> 16);
               dct_o_ptr[7] = (sint16)((F[7] + 0x7FFF) >> 16);

               #ifdef ENC_DEBUG
               printf("o: ");
               for (k = 0; k < 7; k++)
               {
                   printf("%x ", (unsigned int)(uint16)dct_o_ptr[k]);
               }
               printf("%x\r\n", (unsigned int)(uint16)dct_o_ptr[7]);
               #endif

               /* Advance to the next row. */
               dct_o_ptr += 8;
           }

           #ifdef ENC_DEBUG
           printf("\r\n");
           #endif

           return;
       }