
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include <cassert>
#include <iostream>
#include "vlc.hpp"
#include <stdarg.h>
#include <fstream>
// File-level CUDA driver API handles shared by the functions below.
CUdevice    device;      // device handle passed to cuCtxCreate in map_c0
CUmodule    cudaModule;  // module loaded from PTX text in map_c0 and unloaded there
CUcontext   context;     // context created and destroyed inside map_c0
CUfunction  function;    // kernel entry point ("map_ptx0") looked up in cudaModule

  /**
   * Aborts the program when a CUDA driver API call did not succeed.
   *
   * The check is an explicit comparison rather than assert(), so it stays
   * active when NDEBUG is defined and driver failures are never silently
   * ignored in release builds.
   *
   * @param err Result code returned by a CUDA driver API call.
   */
  void checkCudaErrors(CUresult err) {
    if (err != CUDA_SUCCESS) {
      // Print the numeric CUresult so the failure can be looked up in cuda.h.
      fprintf(stderr, "CUDA driver API error: code %d\n", static_cast<int>(err));
      abort();
    }
  }
  /**
   * Runs the "map_ptx0" PTX kernel over two 5-element integer arrays and
   * returns the 5-element result as a one-dimensional VLC_Array<int>.
   *
   * Variadic arguments are consumed in this order:
   *   1. one int constant (uploaded to the device buffer `scale`),
   *   2. two VLC_Array<int> input arrays.
   *
   * The PTX text is read from "add.ptx" and "map_ptx0.ptx" in the current
   * working directory; the program aborts if either file cannot be opened or
   * if any driver call fails (see checkCudaErrors).
   *
   * NOTE(review): the kernel parameter list does not include `scale`, and the
   * call in vlc() passes only two arrays -- confirm the intended argument
   * list with the code generator before changing the consumption order.
   * NOTE(review): the signature has no named parameter, so va_start is
   * anchored on a local variable; this relies on compiler tolerance, and
   * passing class types through `...` is only conditionally supported.
   *
   * @return VLC_Array<int> built from the result buffer and a one-element
   *         dimensions array; both are malloc-family allocations that are
   *         presumably owned by the returned object -- TODO confirm.
   */
  VLC_Array<int> map_c0(...) {
    // Fixed element count of every array handled by this wrapper.
    const size_t kNumElements = 5;
    const size_t kArrayBytes = sizeof(int) * kNumElements;

    // Read both PTX sources first so a missing file fails with a clear message
    // instead of an opaque module-load error.
    std::ifstream add("add.ptx");
    std::ifstream map_ptx0("map_ptx0.ptx");
    if (!add.is_open() || !map_ptx0.is_open()) {
      fprintf(stderr, "map_c0: unable to open add.ptx or map_ptx0.ptx\n");
      abort();
    }

    std::string add_str((std::istreambuf_iterator<char>(add)), std::istreambuf_iterator<char>());
    std::string map_ptx0_str((std::istreambuf_iterator<char>(map_ptx0)), std::istreambuf_iterator<char>());

    // The helper PTX is prepended to the kernel PTX and loaded as one module.
    map_ptx0_str = add_str + "\n" + map_ptx0_str;

    // The driver API must be initialised and a device handle obtained before
    // a context can be created.
    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGet(&device, 0));
    checkCudaErrors(cuCtxCreate(&context, 0, device));

    checkCudaErrors(cuModuleLoadDataEx(&cudaModule, map_ptx0_str.c_str(), 0, 0, 0));
    checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "map_ptx0"));

    // Pull every variadic argument before va_end; reading after va_end is
    // undefined behaviour.
    size_t num_constants = 1;
    va_list args;
    va_start(args, num_constants);
    int scale_value = va_arg(args, int);
    // The array objects stay alive until the function returns so the pointers
    // obtained from get_values() remain valid during the uploads.
    VLC_Array<int> input0 = va_arg(args, VLC_Array<int>);
    VLC_Array<int> input1 = va_arg(args, VLC_Array<int>);
    va_end(args);

    int* host_ptr1 = input0.get_values();
    int* host_ptr2 = input1.get_values();

    // Upload the scalar constant.
    CUdeviceptr scale;
    checkCudaErrors(cuMemAlloc(&scale, sizeof(int)));
    checkCudaErrors(cuMemcpyHtoD(scale, &scale_value, sizeof(int)));

    // Upload both input arrays.
    CUdeviceptr dev_ptr1;
    CUdeviceptr dev_ptr2;
    checkCudaErrors(cuMemAlloc(&dev_ptr1, kArrayBytes));
    checkCudaErrors(cuMemAlloc(&dev_ptr2, kArrayBytes));
    checkCudaErrors(cuMemcpyHtoD(dev_ptr1, host_ptr1, kArrayBytes));
    checkCudaErrors(cuMemcpyHtoD(dev_ptr2, host_ptr2, kArrayBytes));

    // Host result buffer: zero-initialised so the device output buffer starts
    // from a defined state.
    int* host_ptr0 = static_cast<int*>(calloc(kNumElements, sizeof(int)));
    if (host_ptr0 == NULL) {
      fprintf(stderr, "map_c0: out of memory allocating result buffer\n");
      abort();
    }
    CUdeviceptr dev_ptr0;
    checkCudaErrors(cuMemAlloc(&dev_ptr0, kArrayBytes));
    checkCudaErrors(cuMemcpyHtoD(dev_ptr0, host_ptr0, kArrayBytes));

    // cuLaunchKernel expects an array of pointers to the argument values, so
    // the length lives in a real int whose address is passed.
    int array_length = static_cast<int>(kNumElements);
    void* KernelParams[] = { &dev_ptr1, &dev_ptr2, &dev_ptr0, &array_length };

    unsigned int blockSizeX = 16;
    unsigned int blockSizeY = 1;
    unsigned int blockSizeZ = 1;
    unsigned int gridSizeX = 1;
    unsigned int gridSizeY = 1;
    unsigned int gridSizeZ = 1;

    checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
                                   blockSizeX, blockSizeY, blockSizeZ,
                                   0, NULL, KernelParams, NULL));

    // Copy the result back to the host buffer.
    checkCudaErrors(cuMemcpyDtoH(host_ptr0, dev_ptr0, kArrayBytes));

    checkCudaErrors(cuMemFree(dev_ptr1));
    checkCudaErrors(cuMemFree(dev_ptr2));
    checkCudaErrors(cuMemFree(dev_ptr0));
    checkCudaErrors(cuMemFree(scale));

    checkCudaErrors(cuModuleUnload(cudaModule));
    checkCudaErrors(cuCtxDestroy(context));

    // The dimensions array is handed over as size_t*, so it must be allocated
    // with size_t-sized elements.
    size_t* dimensions = static_cast<size_t*>(malloc(sizeof(size_t)));
    if (dimensions == NULL) {
      fprintf(stderr, "map_c0: out of memory allocating dimensions\n");
      abort();
    }
    *dimensions = 1;
    return VLC_Array<int>(static_cast<size_t>(kNumElements), host_ptr0, (size_t)1, dimensions);
  }


int vlc(){
VLC_Array<int> a=VLC_Array(5,1,6,5,1,2,3,4,5);
VLC_Array<int> b=VLC_Array(5,1,6,5,1,2,3,4,5);
VLC_Array<int> c=VLC_Array(5,1,6,5,1,2,3,4,5);
VLC_Array<int> d=map_c0(a,b);
printf(d);
return 1;
}


// Program entry point: the process exit status is whatever vlc() returns.
int main(void) {
  const int status = vlc();
  return status;
}