//#include <stdio.h>
//#include <smmintrin.h>
//
//int main ()
//{
//    __m128 a, b, c, d, e;
//
//    float arr1[] = {1, 1, 1, 1};
//    float arr2[] = {2, 2, 2, 2};
//    float arr3[] = {3, 3, 3, 3};
//    float arr4[] = {4, 4, 4, 4};
//
//    a = _mm_loadu_ps(arr1);
//    b = _mm_loadu_ps(arr2);
//    c = _mm_loadu_ps(arr3);
//    d = _mm_loadu_ps(arr4);
//
//    float r[4];
//
//    a = _mm_blend_ps(a, b, 0x2);
//    c = _mm_blend_ps(c, d, 0x8);
//    e = _mm_blend_ps(a, c, 0xC);
//
//    _mm_storeu_ps(r, e);
//
//    printf("%f %f %f %f\n", r[0], r[1], r[2], r[3]);
//
//    return 0;
//}

#include "Filtering.H"

#include <smmintrin.h>
#include <xmmintrin.h>

#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <memory>
#include <vector>

#include <nrt/Core/Memory/Allocation.H>
#include <valgrind/callgrind.h>



/* As convolve_sse_partial_unroll plus...
 *
 * We repeat the input data 4 times, with each repeat being shifted
 * by one sample from the previous repeat:
 * original: [0, 1, 2, 3, 4, 5, ...]
 *
 * repeat 1: [0, 1, 2, 3, 4, 5, ...]
 * repeat 2: [1, 2, 3, 4, 5, 6, ...]
 * repeat 3: [2, 3, 4, 5, 6, 7, ...]
 * repeat 4: [3, 4, 5, 6, 7, 8, ...]
 *
 * The effect of this is to create a set of arrays that encapsulate
 * a 16-byte alignment for every possible offset within the data.
 * Sample 0 is aligned in repeat 1, sample 1 is aligned in repeat 2,
 * etc. We then wrap around and sample 4 is aligned in repeat 1 again.
 *
 * The copies can be done fast with a memcpy.
 *
 * This means that in our unrolled inner-most loop, we can now do
 * an aligned data load (_mm_load_ps), speeding up the algorithm 
 * by ~2x.
 * */
/* Valid-region convolution of `in` with `kernel`, vectorised with SSE over
 * four shifted, 16-byte-aligned copies of the input.
 *
 * For every i in [0, length - kernel_length] this writes
 *     out[i] = sum over k of in[i + k] * kernel[kernel_length - 1 - k]
 * and leaves every other element of `out` untouched.
 *
 * in             input samples, `length` floats (no alignment required)
 * out            output, at least length - kernel_length + 1 floats
 *                (no alignment required)
 * length         number of input samples (need not be a multiple of 4)
 * kernel         filter taps, `kernel_length` floats
 * kernel_length  number of taps (need not be a multiple of 4)
 *
 * Returns 0 on success. Returns -1 without writing anything when a pointer
 * is null, kernel_length <= 0, or length < kernel_length.
 */
int convolve_sse_in_aligned(float* in, float* out, int length,
        float* kernel, int kernel_length)
{
    if(in == nullptr || out == nullptr || kernel == nullptr
            || kernel_length <= 0 || length < kernel_length){
        return -1;
    }

    const std::size_t samples = static_cast<std::size_t>(length);
    const std::size_t taps = static_cast<std::size_t>(kernel_length);
    const std::size_t out_count = samples - taps + 1;

    // Round the row stride up to a whole number of 4-float vectors. This keeps
    // the start of every shifted row 16-byte aligned for any `length`, and the
    // zero-filled padding makes the last vector load of each row in-bounds.
    const std::size_t stride = (samples + 3) & ~static_cast<std::size_t>(3);

    // One heap workspace (instead of stack VLAs): 4 shifted rows followed by
    // the broadcast kernel. Three spare floats leave room to align the start.
    const std::size_t work_floats = 4 * stride + 4 * taps;
    std::vector<float> storage(work_floats + 3, 0.0f);
    void* cursor = storage.data();
    std::size_t space = storage.size() * sizeof(float);
    float* work = static_cast<float*>(
            std::align(16, work_floats * sizeof(float), cursor, space));
    if(work == nullptr){
        return -1;
    }
    float* shifted = work;
    float* taps_reversed = work + 4 * stride;

    // Reverse the kernel and repeat each tap across a full vector
    for(std::size_t t = 0; t < taps; t++){
        _mm_store_ps(taps_reversed + 4 * t, _mm_set1_ps(kernel[taps - 1 - t]));
    }

    // Row r holds the input shifted left by r samples: shifted[r][j] == in[r + j]
    for(std::size_t r = 0; r < 4 && r < samples; r++){
        std::memcpy(shifted + r * stride, in + r, (samples - r) * sizeof(float));
    }

    for(std::size_t i = 0; i < out_count; i += 4){

        __m128 acc = _mm_setzero_ps();

        for(std::size_t k = 0; k < taps; k += 4){

            // Only real taps are visited, so samples outside an output's
            // window never contribute (not even as 0 * value).
            const std::size_t group = (taps - k < 4) ? (taps - k) : 4;
            const float* column = shifted + i + k;

            for(std::size_t l = 0; l < group; l++){
                // i + k is a multiple of 4, so this load is always aligned
                const __m128 data_block = _mm_load_ps(column + l * stride);
                const __m128 tap = _mm_load_ps(taps_reversed + 4 * (k + l));

                acc = _mm_add_ps(acc, _mm_mul_ps(tap, data_block));
            }
        }

        const std::size_t remaining = out_count - i;
        if(remaining >= 4){
            _mm_storeu_ps(out + i, acc);
        } else {
            // Final partial block: copy only the lanes that are valid outputs
            float lanes[4] __attribute__ ((aligned (16)));
            _mm_store_ps(lanes, acc);
            std::memcpy(out + i, lanes, remaining * sizeof(float));
        }
    }

    return 0;
}








// Writes every element of v to std::cout on a single line, each followed by a
// space. Values below 100 get one leading space and values below 10 get two,
// so small non-negative integers line up in columns. Ends with std::endl.
template<class Allocator>
void print(std::vector<float, Allocator> const & v)
{
  for(auto it = v.begin(); it != v.end(); ++it)
  {
    float const value = *it;
    if(value < 100)
    {
      std::cout << " ";
      if(value < 10) std::cout << " ";
    }
    std::cout << value << " ";
  }
  std::cout << std::endl;
}

int main()
{
  CALLGRIND_STOP_INSTRUMENTATION;
  
  planefiltering::AlignedVector v(2000);
  for(float & val : v) val = rand() / float(RAND_MAX);
  
  planefiltering::AlignedVector k(31);
  for(float & val : k) val = rand() / float(RAND_MAX);

  //std::vector<float> v = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
  //std::vector<float> k = {1,2,3};

  planefiltering::AlignedVector v_aligned(v.size());
  std::copy(v.begin(), v.end(), v_aligned.begin());

  planefiltering::AlignedVector k_aligned(k.size());
  std::copy(k.begin(), k.end(), k_aligned.begin());
  
  planefiltering::AlignedVector r_aligned(v.size(), 0.0);
  
  
  auto norm_start = std::chrono::high_resolution_clock::now();
  auto r = planefiltering::convolve(v, k);
  auto norm_end = std::chrono::high_resolution_clock::now();
  
  auto sse_start = std::chrono::high_resolution_clock::now();
  //CALLGRIND_START_INSTRUMENTATION;
  //for(size_t i=0; i<1000; ++i)
    planefiltering::convolveSSE(v_aligned.data(), v.size(), k_aligned.data(), k.size(), r_aligned.data());
  //CALLGRIND_STOP_INSTRUMENTATION;
  auto sse_end = std::chrono::high_resolution_clock::now();

  
  std::cout << std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(norm_end-norm_start).count() << "ms" << std::endl;
  std::cout << std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(sse_end-sse_start).count() << "ms" << std::endl;
  
  //auto res = std::mismatch(r.begin(), r.end(), r_aligned.begin(),
  //    [](float a, float b) {return std::abs(a-b) < 0.001;});
  //assert(res.first == r.end() && res.second == r_aligned.end());
  
  
  //for(size_t i=0; i<r.size(); ++i)
  //  if(std::abs(r[i]-r_aligned[i]) > 0.001)
  //    std::cout << r[i] << " != " << r_aligned[i] << std::endl;

  //print(k);
  //print(v);
  //print(r);
  //print(r_aligned);
}
