Add Vectors: a CUDA Extension for Python

/* Include headers */
#include "Python.h"
#include "numpy/arrayobject.h"
#include <math.h>
#include <stdlib.h>
#include <cuda_runtime.h>
 
/* Function Declarations */
static PyObject *AddVector(PyObject *self, PyObject *args);
extern "C" void init_CU_AddVector();
int not_doublevector(PyArrayObject *vec);
__global__ void add_vector(float *v1, float *v2, float *result, int n);

 
/* Module Method Table */
static PyMethodDef _MethodDefAddVector[] = {
	{"AddVector", AddVector, METH_VARARGS, "Add two 1-D numpy double vectors on the GPU."},
	{NULL, NULL, 0, NULL}
};
 
/* Initialize C Extension */
void init_CU_AddVector()
{
   (void) Py_InitModule("_CU_AddVector", _MethodDefAddVector);
   import_array();
}
 
/*
 * Add two numpy vectors together (using CUDA)
 * - Python signature: AddVector(v1, v2) -> result, where result = v1 + v2
 * - v1 and v2 are 1-D numpy vectors of doubles
 */
static PyObject *AddVector(PyObject *self, PyObject *args)
{
   /* Declare all variables */
   PyArrayObject *v1, *v2, *result;
   float *device_v1 = 0; float *device_v2 = 0; float *device_result = 0;
   float *c_v1 = 0; float *c_v2 = 0; float *c_result = 0;
   double *c_double_v1 = 0; double *c_double_v2 = 0;
   double *c_double_result = 0;
   npy_intp dims[1];
   int i, n, memory_size;

 
   /* Parse parameters and verify input */
   if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, &v1, &PyArray_Type, &v2))
      return NULL;
   if (not_doublevector(v1) || not_doublevector(v2))  return NULL;
   if (v1->dimensions[0] != v2->dimensions[0])
   {
      PyErr_SetString(PyExc_ValueError, "Input vectors must have the same length.");
      return NULL;
   }

 
   /* Create the returned array */
   n = v1->dimensions[0];
   dims[0] = n;
   result = (PyArrayObject *) PyArray_SimpleNew(1, dims, NPY_DOUBLE);
   if (NULL == result)  return NULL;

 
   /* Pointer to array of double precision values */
   c_double_result = (double*)result->data;
   c_double_v1 = (double*)v1->data;
   c_double_v2 = (double*)v2->data;

 
   /* Allocate Memory on CPU */
   memory_size = n*sizeof(float);
   c_v1 = (float*)malloc(memory_size);
   c_v2 = (float*)malloc(memory_size);
   c_result = (float*)malloc(memory_size);

 
   if (c_v1 == 0 || c_v2 == 0 || c_result == 0)
   {
      free(c_v1); free(c_v2); free(c_result);
      Py_DECREF(result);
      PyErr_SetString(PyExc_MemoryError, "Couldn't allocate memory on CPU");
      return NULL;
   }

 
   /* Copy data from the double arrays, casting down to float */
   for (i = 0; i < n; i++)
   {
      c_v1[i] = (float)c_double_v1[i];
      c_v2[i] = (float)c_double_v2[i];
   }
 
   /* Allocate memory on the GPU for the vector addition */
   cudaMalloc((void**)&device_v1, memory_size);
   cudaMalloc((void**)&device_v2, memory_size);
   cudaMalloc((void**)&device_result, memory_size);

   if (device_v1 == 0 || device_v2 == 0 || device_result == 0)
   {
      cudaFree(device_v1); cudaFree(device_v2); cudaFree(device_result);
      free(c_v1); free(c_v2); free(c_result);
      Py_DECREF(result);
      PyErr_SetString(PyExc_MemoryError, "Couldn't allocate memory on GPU");
      return NULL;
   }
 
   /* Copy v1 and v2 to GPU memory */
   cudaMemcpy(device_v1, c_v1, memory_size, cudaMemcpyHostToDevice);
   cudaMemcpy(device_v2, c_v2, memory_size, cudaMemcpyHostToDevice);
 
   /* Start Kernel */
   int block_size = 512;
   int grid_size = (n + block_size - 1) / block_size;
   add_vector<<<grid_size,block_size>>>(device_v1, device_v2, device_result, n);

 
   /* Copy the result from the GPU back to the host float buffer */
   cudaMemcpy(c_result, device_result, memory_size, cudaMemcpyDeviceToHost);
 
   /* Free GPU memory */
   cudaFree(device_v1);
   cudaFree(device_v2);
   cudaFree(device_result);

 
   /* Copy the float results into the numpy (double) output array */
   for (i = 0; i < n; i++)
      c_double_result[i] = (double)c_result[i];

   /* Free CPU memory */
   free(c_v1);
   free(c_v2);
   free(c_result);

   return PyArray_Return(result);
}
 
/* CUDA kernel: add two vectors element-wise */
__global__ void add_vector(float *v1, float *v2, float *result, int n)
{
   int i = blockIdx.x*blockDim.x + threadIdx.x;
   if (i < n)
   {
      result[i] = v1[i] + v2[i];
   }
}
 
/* Check that the PyArrayObject is a double (Float) type and a 1-D vector */
int not_doublevector(PyArrayObject *vec)
{
   if (vec->descr->type_num != NPY_DOUBLE || vec->nd != 1)
   {
      PyErr_SetString(PyExc_ValueError,
              "Array must be of type Float and 1 dimensional (n).");
      return 1;
   }
   return 0;
}
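
Once the file is compiled with nvcc into a shared library exposing the module name registered by Py_InitModule above (_CU_AddVector), it can be called from Python 2 with 1-D float64 numpy arrays. The sketch below is a minimal usage example under the assumption that the built module is on the Python path; the source file name and build flags are not part of the listing above.

# Minimal usage sketch (assumes the extension was built and is importable
# as _CU_AddVector; build details are not shown in the original listing).
import numpy as np
import _CU_AddVector

v1 = np.arange(10, dtype=np.float64)       # inputs must be 1-D float64 vectors
v2 = np.ones(10, dtype=np.float64)

result = _CU_AddVector.AddVector(v1, v2)   # element-wise sum computed on the GPU
print(result)                              # expected: [ 1.  2.  3. ... 10.]

Note that the extension casts the double inputs down to float before they are copied to the GPU, so the returned values are only single-precision accurate even though the output array is float64.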