#include <iostream>
#ifdef LINUX
#include <pmmintrin.h>
#else
#include<intrin.h>
#endif

typedef unsigned char int8;


// Clamps an integer to the range of int8 (unsigned char): negative inputs
// become 0, inputs above 255 become 255, everything else passes through.
int8 sat(int a)
{
	if (a < 0)
		return 0;
	if (a > 255)
		return 255;
	return (int8)a;
}

//SSE add API
// Element-wise saturating add of two int8 arrays:
// dst[i] = sat(src1[i] + src2[i]) for i in [0, size).
//
// Full groups of 16 bytes use unaligned SSE2 loads/stores and the unsigned
// saturating byte add; the remaining size % 16 elements go through sat().
// A non-positive size performs no work. Returns dst.
int8* add(int8 *dst, const int8* src1, const int8 *src2, int size)
{
	const int vec_end = (size / 16) * 16;
	int i = 0;

	for (; i < vec_end; i += 16)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_adds_epu8(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = sat(src1[i] + src2[i]);

	return dst;
}

// Saturating add of a scalar to every element of an int8 array:
// dst[i] = sat(src1[i] + src2) for i in [0, size).
//
// Full groups of 16 bytes use SSE2; the remaining elements go through
// sat(). A non-positive size performs no work. Returns dst.
int8* add(int8 * dst, const int8* src1, const int8 src2, int size)
{
	const int vec_end = (size / 16) * 16;
	// The scalar must be replicated into every *byte* lane; a 32-bit
	// broadcast would only place it in one byte out of four.
	const __m128i addend = _mm_set1_epi8((char)src2);
	int i = 0;

	for (; i < vec_end; i += 16)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_adds_epu8(a, addend));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = sat(src1[i] + src2);

	return dst;
}
// Scalar-first overload of the int8 add. Addition is commutative, so the
// call is forwarded to the (array, scalar) overload with operands swapped.
int8* add(int8 *dst, const int8 src1, const int8* src2, int size)
{
	int8 *result = add(dst, src2, src1, size);
	return result;
}
// Element-wise add of two int arrays: dst[i] = src1[i] + src2[i] for
// i in [0, size).
//
// Full groups of 4 ints use unaligned SSE2 loads/stores; the remaining
// size % 4 elements are added one at a time. A non-positive size performs
// no work. Returns dst.
int* add(int *dst, const int* src1, const int *src2, int size)
{
	const int vec_end = (size / 4) * 4;
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_add_epi32(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] + src2[i];

	return dst;
}

// Adds a scalar to every element of an int array:
// dst[i] = src1[i] + src2 for i in [0, size).
//
// Full groups of 4 ints use SSE2 with the scalar broadcast to all lanes;
// the remaining elements are handled one at a time. A non-positive size
// performs no work. Returns dst.
int* add(int * dst, const int* src1, const int src2, int size)
{
	const int vec_end = (size / 4) * 4;
	const __m128i addend = _mm_set1_epi32(src2);
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_add_epi32(a, addend));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] + src2;

	return dst;
}

// Scalar-first overload of the int add. Addition is commutative, so the
// call is forwarded to the (array, scalar) overload with operands swapped.
int* add(int *dst, const int src1, const int* src2, int size)
{
	int *result = add(dst, src2, src1, size);
	return result;
}

// Element-wise add of two float arrays: dst[i] = src1[i] + src2[i] for
// i in [0, size).
//
// Full groups of 4 floats use unaligned SSE loads/stores; the remaining
// size % 4 elements are added one at a time. A non-positive size performs
// no work. Returns dst.
float* add(float *dst, const float* src1, const float *src2, int size)
{
	const int vec_end = (size / 4) * 4;
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128 a = _mm_loadu_ps(src1 + i);
		const __m128 b = _mm_loadu_ps(src2 + i);
		_mm_storeu_ps(dst + i, _mm_add_ps(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] + src2[i];

	return dst;
}

// Adds a scalar to every element of a float array:
// dst[i] = src1[i] + src2 for i in [0, size).
//
// Full groups of 4 floats use SSE with the scalar broadcast to all lanes;
// the remaining elements are handled one at a time. A non-positive size
// performs no work. Returns dst.
float* add(float *dst, const float* src1, const float src2, int size)
{
	const int vec_end = (size / 4) * 4;
	const __m128 addend = _mm_set1_ps(src2);
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128 a = _mm_loadu_ps(src1 + i);
		_mm_storeu_ps(dst + i, _mm_add_ps(a, addend));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] + src2;

	return dst;
}

// Scalar-first overload of the float add: dst[i] = src1 + src2[i].
// Returns dst.
float* add(float *dst, const float src1, const float *src2, int size)
{
	// Addition is commutative, so reuse the (array, scalar) overload. Its
	// result has to be returned: flowing off the end of a non-void function
	// is undefined behaviour.
	return add(dst, src2, src1, size);
}


//SSE sub API
// Element-wise saturating subtract of two int8 arrays:
// dst[i] = sat(src1[i] - src2[i]) for i in [0, size), so results that would
// be negative become 0.
//
// Full groups of 16 bytes use unaligned SSE2 loads/stores and the unsigned
// saturating byte subtract; the remaining elements go through sat().
// A non-positive size performs no work. Returns dst.
int8* sub(int8 *dst, const int8* src1, const int8 *src2, int size)
{
	const int vec_end = (size / 16) * 16;
	int i = 0;

	for (; i < vec_end; i += 16)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_subs_epu8(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = sat(src1[i] - src2[i]);

	return dst;
}

// Saturating subtract of a scalar from every element of an int8 array:
// dst[i] = sat(src1[i] - src2) for i in [0, size).
//
// Full groups of 16 bytes use SSE2; the remaining elements go through
// sat(). A non-positive size performs no work. Returns dst.
int8* sub(int8 * dst, const int8* src1, const int8 src2, int size)
{
	const int vec_end = (size / 16) * 16;
	// The scalar must be replicated into every *byte* lane; a 32-bit
	// broadcast would only place it in one byte out of four.
	const __m128i subtrahend = _mm_set1_epi8((char)src2);
	int i = 0;

	for (; i < vec_end; i += 16)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_subs_epu8(a, subtrahend));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = sat(src1[i] - src2);

	return dst;
}
// Saturating subtract of every element of an int8 array from a scalar:
// dst[i] = sat(src1 - src2[i]) for i in [0, size).
//
// Full groups of 16 bytes use SSE2; the remaining elements go through
// sat(). A non-positive size performs no work. Returns dst.
int8* sub(int8 *dst, const int8 src1, const int8* src2, int size)
{
	const int vec_end = (size / 16) * 16;
	// The scalar must be replicated into every *byte* lane; a 32-bit
	// broadcast would only place it in one byte out of four.
	const __m128i minuend = _mm_set1_epi8((char)src1);
	int i = 0;

	for (; i < vec_end; i += 16)
	{
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_subs_epu8(minuend, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = sat(src1 - src2[i]);

	return dst;
}
// Element-wise subtract of two int arrays: dst[i] = src1[i] - src2[i] for
// i in [0, size).
//
// Full groups of 4 ints use unaligned SSE2 loads/stores; the remaining
// size % 4 elements are handled one at a time. A non-positive size performs
// no work. Returns dst.
int* sub(int *dst, const int* src1, const int *src2, int size)
{
	const int vec_end = (size / 4) * 4;
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_sub_epi32(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] - src2[i];

	return dst;
}

// Subtracts a scalar from every element of an int array:
// dst[i] = src1[i] - src2 for i in [0, size).
//
// Full groups of 4 ints use SSE2 with the scalar broadcast to all lanes;
// the remaining elements are handled one at a time. A non-positive size
// performs no work. Returns dst.
int* sub(int * dst, const int* src1, const int src2, int size)
{
	const int vec_end = (size / 4) * 4;
	const __m128i subtrahend = _mm_set1_epi32(src2);
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_sub_epi32(a, subtrahend));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] - src2;

	return dst;
}

// Subtracts every element of an int array from a scalar:
// dst[i] = src1 - src2[i] for i in [0, size).
//
// Full groups of 4 ints use SSE2 with the scalar broadcast to all lanes;
// the remaining elements are handled one at a time. A non-positive size
// performs no work. Returns dst.
int* sub(int *dst, const int src1, const int* src2, int size)
{
	const int vec_end = (size / 4) * 4;
	const __m128i minuend = _mm_set1_epi32(src1);
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_sub_epi32(minuend, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1 - src2[i];

	return dst;
}

// Element-wise subtract of two float arrays: dst[i] = src1[i] - src2[i]
// for i in [0, size).
//
// Full groups of 4 floats use unaligned SSE loads/stores; the remaining
// size % 4 elements are handled one at a time. A non-positive size performs
// no work. Returns dst.
float* sub(float *dst, const float* src1, const float *src2, int size)
{
	const int vec_end = (size / 4) * 4;
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128 a = _mm_loadu_ps(src1 + i);
		const __m128 b = _mm_loadu_ps(src2 + i);
		_mm_storeu_ps(dst + i, _mm_sub_ps(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] - src2[i];

	return dst;
}

// Subtracts a scalar from every element of a float array:
// dst[i] = src1[i] - src2 for i in [0, size).
//
// Full groups of 4 floats use SSE with the scalar broadcast to all lanes;
// the remaining elements are handled one at a time. A non-positive size
// performs no work. Returns dst.
float* sub(float *dst, const float* src1, const float src2, int size)
{
	const int vec_end = (size / 4) * 4;
	const __m128 subtrahend = _mm_set1_ps(src2);
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128 a = _mm_loadu_ps(src1 + i);
		_mm_storeu_ps(dst + i, _mm_sub_ps(a, subtrahend));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] - src2;

	return dst;
}

// Subtracts every element of a float array from a scalar:
// dst[i] = src1 - src2[i] for i in [0, size).
//
// Full groups of 4 floats use SSE with the scalar broadcast to all lanes;
// the remaining elements are handled one at a time. A non-positive size
// performs no work. Returns dst.
float* sub(float *dst, const float src1, const float *src2, int size)
{
	const int vec_end = (size / 4) * 4;
	// The broadcast minuend lives in its own register for the whole loop;
	// the per-iteration difference must never be written back into it.
	const __m128 minuend = _mm_set1_ps(src1);
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128 b = _mm_loadu_ps(src2 + i);
		_mm_storeu_ps(dst + i, _mm_sub_ps(minuend, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1 - src2[i];

	return dst;
}


//SSE "bitwise and" API
// Element-wise bitwise AND of two arrays: dst[i] = src1[i] & src2[i] for
// i in [0, size).
//
// T is expected to be an integer type whose size divides 16. Whole 16-byte
// groups are combined with SSE2; the remaining elements are handled one at
// a time. A non-positive size performs no work. Returns dst.
template <typename T>
T* bit_and(T *dst, const T* src1, const T *src2, int size)
{
	// Elements per 128-bit vector, kept as int so that a negative size is
	// not promoted to a huge unsigned count.
	const int lanes = (int)(16 / sizeof(T));
	const int vec_end = (size / lanes) * lanes;
	int i = 0;

	for (; i < vec_end; i += lanes)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_and_si128(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] & src2[i];

	return dst;
}

// Bitwise AND of every array element with a scalar:
// dst[i] = src1[i] & src2 for i in [0, size).
//
// T is expected to be an integer type whose size divides 16. Whole 16-byte
// groups are combined with SSE2; the remaining elements are handled one at
// a time. A non-positive size performs no work. Returns dst.
template <typename T>
T* bit_and(T * dst, const T* src1, const T src2, int size)
{
	// Elements per 128-bit vector, kept as int so that a negative size is
	// not promoted to a huge unsigned count.
	const int lanes = (int)(16 / sizeof(T));
	const int vec_end = (size / lanes) * lanes;

	// Build the broadcast from a buffer of T so each lane has the width of
	// T; a fixed 32-bit broadcast is only right when sizeof(T) == 4.
	T pattern[16 / sizeof(T)];
	for (int k = 0; k < lanes; k++)
		pattern[k] = src2;
	const __m128i mask = _mm_loadu_si128((const __m128i*)pattern);

	int i = 0;
	for (; i < vec_end; i += lanes)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_and_si128(a, mask));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] & src2;

	return dst;
}

// Scalar-first overload of bit_and. AND is commutative, so the call is
// forwarded to the (array, scalar) overload with the operands swapped.
template <typename T>
T* bit_and(T *dst, const T src1, const T* src2, int size)
{
	T *result = bit_and(dst, src2, src1, size);
	return result;
}


//SSE "bitwise or" API
// Element-wise bitwise OR of two arrays: dst[i] = src1[i] | src2[i] for
// i in [0, size).
//
// T is expected to be an integer type whose size divides 16. Whole 16-byte
// groups are combined with SSE2; the remaining elements are handled one at
// a time. A non-positive size performs no work. Returns dst.
template <typename T>
T* bit_or(T *dst, const T* src1, const T *src2, int size)
{
	// Elements per 128-bit vector, kept as int so that a negative size is
	// not promoted to a huge unsigned count.
	const int lanes = (int)(16 / sizeof(T));
	const int vec_end = (size / lanes) * lanes;
	int i = 0;

	for (; i < vec_end; i += lanes)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_or_si128(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] | src2[i];

	return dst;
}

// Bitwise OR of every array element with a scalar:
// dst[i] = src1[i] | src2 for i in [0, size).
//
// T is expected to be an integer type whose size divides 16. Whole 16-byte
// groups are combined with SSE2; the remaining elements are handled one at
// a time. A non-positive size performs no work. Returns dst.
template <typename T>
T* bit_or(T * dst, const T* src1, const T src2, int size)
{
	// Elements per 128-bit vector, kept as int so that a negative size is
	// not promoted to a huge unsigned count.
	const int lanes = (int)(16 / sizeof(T));
	const int vec_end = (size / lanes) * lanes;

	// Build the broadcast from a buffer of T so each lane has the width of
	// T; a fixed 32-bit broadcast is only right when sizeof(T) == 4.
	T pattern[16 / sizeof(T)];
	for (int k = 0; k < lanes; k++)
		pattern[k] = src2;
	const __m128i bits = _mm_loadu_si128((const __m128i*)pattern);

	int i = 0;
	for (; i < vec_end; i += lanes)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_or_si128(a, bits));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] | src2;

	return dst;
}

// Scalar-first overload of bit_or. OR is commutative, so the call is
// forwarded to the (array, scalar) overload with the operands swapped.
template <typename T>
T* bit_or(T *dst, const T src1, const T* src2, int size)
{
	T *result = bit_or(dst, src2, src1, size);
	return result;
}

//SSE "bitwise xor" API
// Element-wise bitwise XOR of two arrays: dst[i] = src1[i] ^ src2[i] for
// i in [0, size).
//
// T is expected to be an integer type whose size divides 16. Whole 16-byte
// groups are combined with SSE2; the remaining elements are handled one at
// a time. A non-positive size performs no work. Returns dst.
template <typename T>
T* bit_xor(T *dst, const T* src1, const T *src2, int size)
{
	// Elements per 128-bit vector, kept as int so that a negative size is
	// not promoted to a huge unsigned count.
	const int lanes = (int)(16 / sizeof(T));
	const int vec_end = (size / lanes) * lanes;
	int i = 0;

	for (; i < vec_end; i += lanes)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_xor_si128(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] ^ src2[i];

	return dst;
}

// Bitwise XOR of every array element with a scalar:
// dst[i] = src1[i] ^ src2 for i in [0, size).
//
// T is expected to be an integer type whose size divides 16. Whole 16-byte
// groups are combined with SSE2; the remaining elements are handled one at
// a time. A non-positive size performs no work. Returns dst.
template <typename T>
T* bit_xor(T * dst, const T* src1, const T src2, int size)
{
	// Elements per 128-bit vector, kept as int so that a negative size is
	// not promoted to a huge unsigned count.
	const int lanes = (int)(16 / sizeof(T));
	const int vec_end = (size / lanes) * lanes;

	// Build the broadcast from a buffer of T so each lane has the width of
	// T; a fixed 32-bit broadcast is only right when sizeof(T) == 4.
	T pattern[16 / sizeof(T)];
	for (int k = 0; k < lanes; k++)
		pattern[k] = src2;
	const __m128i bits = _mm_loadu_si128((const __m128i*)pattern);

	int i = 0;
	for (; i < vec_end; i += lanes)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_xor_si128(a, bits));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] ^ src2;

	return dst;
}

// Scalar-first overload of bit_xor. XOR is commutative, so the call is
// forwarded to the (array, scalar) overload with the operands swapped.
template <typename T>
T* bit_xor(T *dst, const T src1, const T* src2, int size)
{
	T *result = bit_xor(dst, src2, src1, size);
	return result;
}


//TODO: SSE implementation
// Copies size elements from src to dst: dst[i] = src[i] for i in [0, size).
//
// T is expected to be a type whose size divides 16. Whole 16-byte groups
// are moved with unaligned SSE2 loads/stores; the remaining elements are
// assigned one at a time. A non-positive size performs no work.
// Returns dst.
template <typename T>
T* copy(T *dst, T* src, int size)
{
	// Elements per 128-bit vector, kept as int so that a negative size is
	// not promoted to a huge unsigned count.
	const int lanes = (int)(16 / sizeof(T));
	const int vec_end = (size / lanes) * lanes;
	int i = 0;

	for (; i < vec_end; i += lanes)
	{
		const __m128i chunk = _mm_loadu_si128((const __m128i*)(src + i));
		_mm_storeu_si128((__m128i*)(dst + i), chunk);
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src[i];

	return dst;
}


// Fills an int8 array with a constant: dst[i] = (int8)src for
// i in [0, size).
//
// Whole groups of 16 bytes are written with unaligned SSE2 stores; the
// remaining elements are assigned one at a time. A non-positive size
// performs no work. Returns dst.
int8* copy(int8 *dst, const int src, int size)
{
	const int vec_end = (size / 16) * 16;
	// src is narrowed to int8 first, exactly as the scalar tail does.
	const int8 value = (int8)src;
	const __m128i fill = _mm_set1_epi8((char)value);
	int i = 0;

	for (; i < vec_end; i += 16)
		_mm_storeu_si128((__m128i*)(dst + i), fill);
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = value;

	return dst;
}

// Fills a float array with a constant: dst[i] = (float)src for
// i in [0, size).
//
// Whole groups of 4 floats are written with unaligned SSE stores; the
// remaining elements are assigned one at a time. A non-positive size
// performs no work. Returns dst.
float* copy(float *dst, const double src, int size)
{
	const int vec_end = (size / 4) * 4;
	// Narrow once so the vector and scalar paths store the same value.
	const float value = (float)src;
	const __m128 fill = _mm_set1_ps(value);
	int i = 0;

	for (; i < vec_end; i += 4)
		_mm_storeu_ps(dst + i, fill);
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = value;

	return dst;
}


// Fills an int array with a constant: dst[i] = src for i in [0, size).
//
// Whole groups of 4 ints are written with unaligned SSE2 stores; the
// remaining elements are assigned one at a time. A non-positive size
// performs no work. Returns dst.
int* copy(int *dst, const int src, int size)
{
	const int vec_end = (size / 4) * 4;
	const __m128i fill = _mm_set1_epi32(src);
	int i = 0;

	for (; i < vec_end; i += 4)
		_mm_storeu_si128((__m128i*)(dst + i), fill);
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src;

	return dst;
}

//template<typename T>
//void copy(T* a, double b, int size)
//{
//	for(int i = 0; i< size; i++)
//			a[i] = (T)b;
//	
//}

// Lane-wise 32-bit multiply using only SSE2: each of the four 32-bit lanes
// of the result holds the low 32 bits of src1[lane] * src2[lane], which is
// what a wrapping int multiplication produces.
//
// NOTE(review): this name matches the SSE4.1 intrinsic declared in
// <smmintrin.h> (which has different semantics) and uses a reserved
// identifier; confirm it cannot clash with the headers in every build.
__m128i _mm_mul_epi32(const __m128i &src1, const __m128i &src2)
{
	// _mm_mul_epu32 multiplies lanes 0 and 2 into two 64-bit products. The
	// low 32 bits of a product are the same for signed and unsigned inputs.
	const __m128i even = _mm_mul_epu32(src1, src2);
	// Shift lanes 1 and 3 down into positions 0 and 2 to multiply them too.
	const __m128i odd = _mm_mul_epu32(_mm_srli_si128(src1, 4),
	                                  _mm_srli_si128(src2, 4));

	// Gather the low dword of each 64-bit product into lanes 0 and 1 ...
	const __m128i evenLow = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
	const __m128i oddLow = _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0));

	// ... and interleave them back into lane order 0, 1, 2, 3.
	return _mm_unpacklo_epi32(evenLow, oddLow);
}

// Lane-wise multiply of sixteen unsigned bytes with saturation: each result
// byte is min(src1[lane] * src2[lane], 255).
__m128i _mm_mul_epi8(const __m128i &src1, const __m128i &src2)
{
	const __m128i zero = _mm_setzero_si128();
	// 0xFF00 in every 16-bit lane, used to clamp the products below.
	const __m128i bias = _mm_set1_epi16(-256);

	// Zero-extend the bytes to 16 bits; a product of two bytes is at most
	// 255 * 255 = 65025 and therefore fits in an unsigned 16-bit lane.
	__m128i hiProd = _mm_mullo_epi16(_mm_unpackhi_epi8(src1, zero),
	                                 _mm_unpackhi_epi8(src2, zero));
	__m128i loProd = _mm_mullo_epi16(_mm_unpacklo_epi8(src1, zero),
	                                 _mm_unpacklo_epi8(src2, zero));

	// Clamp each unsigned 16-bit product to 255 before packing. The pack
	// instruction reads its input as *signed*, so a product >= 32768 would
	// otherwise look negative and be stored as 0. Adding 0xFF00 with
	// unsigned saturation pins anything above 255 at 0xFFFF; subtracting
	// 0xFF00 again yields 255 for those lanes and the original value for
	// the rest.
	hiProd = _mm_sub_epi16(_mm_adds_epu16(hiProd, bias), bias);
	loProd = _mm_sub_epi16(_mm_adds_epu16(loProd, bias), bias);

	return _mm_packus_epi16(loProd, hiProd);
}


// Element-wise multiply of two int arrays over i in [0, size).
//
// Full groups of 4 ints are multiplied by this file's _mm_mul_epi32()
// helper; the remaining size % 4 elements use plain int multiplication.
// A non-positive size performs no work. Returns dst.
int* mul(int *dst, const int* src1, const int *src2, int size)
{
	const int vec_end = (size / 4) * 4;
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_mul_epi32(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] * src2[i];

	return dst;
}

// Multiplies every element of an int array by a scalar over
// i in [0, size).
//
// Full groups of 4 ints are multiplied by this file's _mm_mul_epi32()
// helper with the scalar broadcast to all lanes; the remaining elements
// use plain int multiplication. A non-positive size performs no work.
// Returns dst.
int* mul(int * dst, const int* src1, const int src2, int size)
{
	const int vec_end = (size / 4) * 4;
	const __m128i factor = _mm_set1_epi32(src2);
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_mul_epi32(a, factor));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] * src2;

	return dst;
}




// Element-wise multiply of two int8 arrays over i in [0, size).
//
// Full groups of 16 bytes are multiplied by this file's _mm_mul_epi8()
// helper; the remaining size % 16 elements are multiplied as int and
// clamped with sat(). A non-positive size performs no work. Returns dst.
int8* mul(int8 *dst, const int8 *src1, const int8 *src2, int size)
{
	const int vec_end = (size / 16) * 16;
	int i = 0;

	for (; i < vec_end; i += 16)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(src2 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_mul_epi8(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = sat(src1[i] * src2[i]);

	return dst;
}

// Multiplies every element of an int8 array by a scalar over
// i in [0, size).
//
// Full groups of 16 bytes are multiplied by this file's _mm_mul_epi8()
// helper with the scalar broadcast to every byte lane; the remaining
// elements are multiplied as int and clamped with sat(). A non-positive
// size performs no work. Returns dst.
int8* mul(int8 *dst, const int8 *src1, const int8 src2, int size)
{
	const int vec_end = (size / 16) * 16;
	const __m128i factor = _mm_set1_epi8((char)src2);
	int i = 0;

	for (; i < vec_end; i += 16)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(src1 + i));
		_mm_storeu_si128((__m128i*)(dst + i), _mm_mul_epi8(a, factor));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = sat(src1[i] * src2);

	return dst;
}


// Element-wise multiply of two float arrays: dst[i] = src1[i] * src2[i]
// for i in [0, size).
//
// Full groups of 4 floats use unaligned SSE loads/stores; the remaining
// size % 4 elements are handled one at a time. A non-positive size performs
// no work. Returns dst.
float* mul(float *dst, const float* src1, const float *src2, int size)
{
	const int vec_end = (size / 4) * 4;
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128 a = _mm_loadu_ps(src1 + i);
		const __m128 b = _mm_loadu_ps(src2 + i);
		_mm_storeu_ps(dst + i, _mm_mul_ps(a, b));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] * src2[i];

	return dst;
}

// Multiplies every element of a float array by a scalar:
// dst[i] = src1[i] * src2 for i in [0, size).
//
// Full groups of 4 floats use SSE with the scalar broadcast to all lanes;
// the remaining elements are handled one at a time. A non-positive size
// performs no work. Returns dst.
float* mul(float *dst, const float* src1, const float src2, int size)
{
	const int vec_end = (size / 4) * 4;
	const __m128 factor = _mm_set1_ps(src2);
	int i = 0;

	for (; i < vec_end; i += 4)
	{
		const __m128 a = _mm_loadu_ps(src1 + i);
		_mm_storeu_ps(dst + i, _mm_mul_ps(a, factor));
	}
	// Scalar tail for the elements that do not fill a whole vector.
	for (; i < size; i++)
		dst[i] = src1[i] * src2;

	return dst;
}


// Generic element-wise multiply: dst[i] = src1[i] * src2[i] for
// i in [0, size). Plain scalar loop for any type that supports operator*.
// Returns dst.
template <typename T>
T* mul(T *dst, const T *src1, const T *src2, int size)
{
	for (int i = 0; i < size; i++)
		dst[i] = src1[i] * src2[i];
	// The function is declared to return T*; flowing off the end without a
	// return statement would be undefined behaviour.
	return dst;
}

// Generic multiply by a scalar: dst[i] = src1[i] * src2 for
// i in [0, size). Plain scalar loop for any type that supports operator*.
// Returns dst.
template <typename T>
T* mul(T *dst, const T *src1, const T src2, int size)
{
	for (int i = 0; i < size; i++)
		dst[i] = src1[i] * src2;
	// The function is declared to return T*; flowing off the end without a
	// return statement would be undefined behaviour.
	return dst;
}

// Scalar-first overload of mul. The operands are swapped and the call is
// forwarded to an (array, scalar) overload of mul.
template <typename T>
T* mul(T *dst, const T src1, const T* src2, int size)
{
	T *result = mul(dst, src2, src1, size);
	return result;
}

// Generic element-wise divide: dst[i] = src1[i] / src2[i] for
// i in [0, size). Plain scalar loop; no check is made for zero divisors.
// Returns dst.
template <typename T>
T* div(T *dst, const T *src1, const T *src2, int size)
{
	int idx = 0;
	while (idx < size)
	{
		dst[idx] = src1[idx] / src2[idx];
		++idx;
	}
	return dst;
}

// Generic divide by a scalar: dst[i] = src1[i] / src2 for i in [0, size).
// Plain scalar loop; no check is made for a zero divisor. Returns dst.
template <typename T>
T* div(T *dst, const T *src1, const T src2, int size)
{
	int idx = 0;
	while (idx < size)
	{
		dst[idx] = src1[idx] / src2;
		++idx;
	}
	return dst;
}

// Generic divide of a scalar by every array element:
// dst[i] = src1 / src2[i] for i in [0, size). Plain scalar loop; no check
// is made for zero divisors. Returns dst.
template <typename T>
T* div(T *dst, const T src1, const T* src2, int size)
{
	// Division is not commutative: the scalar (first operand) is the
	// dividend, matching the operand order of the scalar-first sub().
	for (int i = 0; i < size; i++)
		dst[i] = src1 / src2[i];
	return dst;
}

