1 cuda code #include <unistd.h>
2 cuda code #include <error.h>
3 cuda code #include <stdio.h>
4 cuda code #include <stdlib.h>
5 cuda code #include <errno.h>
6 cuda code #include <assert.h>
8 cuda code #include "components.h"
9 cuda code #include "common.h"
11 cuda code #define THREADS 256
/*
 * Store one RGB triple into three float component buffers.
 *
 * Each value is divided by 255 and then lowered by 0.5 before it is written,
 * so an input of 0 is stored as -0.5 and an input of 255 is stored as 0.5.
 *
 * d_r, d_g, d_b : destination buffers for the red, green and blue values
 * r, g, b       : component values to convert and store
 * pos           : element index written in each destination buffer
 */
__device__ void storeComponents(float *d_r, float *d_g, float *d_b, float r, float g, float b, int pos)
{
    const float red   = r / 255.0f;
    const float green = g / 255.0f;
    const float blue  = b / 255.0f;

    *(d_r + pos) = red   - 0.5f;
    *(d_g + pos) = green - 0.5f;
    *(d_b + pos) = blue  - 0.5f;
}
/*
 * Store one RGB triple into three integer component buffers.
 *
 * Each value is lowered by 128 before it is written, so an input of 0 is
 * stored as -128 and an input of 255 is stored as 127.
 *
 * d_r, d_g, d_b : destination buffers for the red, green and blue values
 * r, g, b       : component values to shift and store
 * pos           : element index written in each destination buffer
 */
__device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int b, int pos)
{
    const int shift = 128;

    *(d_r + pos) = r - shift;
    *(d_g + pos) = g - shift;
    *(d_b + pos) = b - shift;
}
/*
 * Store a single value into a float component buffer.
 *
 * The value is divided by 255 and then lowered by 0.5 before it is written,
 * so an input of 0 is stored as -0.5 and an input of 255 is stored as 0.5.
 *
 * d_c : destination buffer
 * c   : component value to convert and store
 * pos : element index written in the destination buffer
 */
__device__ void storeComponent(float *d_c, float c, int pos)
{
    const float scaled = c / 255.0f;
    *(d_c + pos) = scaled - 0.5f;
}
/*
 * Store a single value into an integer component buffer.
 *
 * The value is lowered by 128 before it is written, so an input of 0 is
 * stored as -128 and an input of 255 is stored as 127.
 *
 * d_c : destination buffer
 * c   : component value to shift and store
 * pos : element index written in the destination buffer
 */
__device__ void storeComponent(int *d_c, int c, int pos)
{
    const int shifted = c - 128;
    *(d_c + pos) = shifted;
}
/*
 * Split packed 8-bit RGB source data into three separate component buffers.
 *
 * Launch layout : 1D grid of 1D blocks. The kernel is written for blocks of
 *                 exactly THREADS threads; each block handles THREADS
 *                 consecutive pixels, i.e. THREADS*3 source bytes.
 * Shared memory : THREADS*3 bytes, statically allocated.
 * Preconditions : d_src must be 4-byte aligned and must hold
 *                 gridDim.x*THREADS*3 readable bytes, because the staging
 *                 copy reads whole blocks without a tail check.
 *                 THREADS*3 must be a multiple of 4.
 *
 * d_r, d_g, d_b : output buffers, one element per pixel
 * d_src         : packed source bytes, three per pixel, read in r, g, b order
 * pixels        : number of valid pixels; output positions >= pixels are skipped
 */
template<typename T>
__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
                                      unsigned char * d_src,
                                      int pixels)
{
    const int x  = threadIdx.x;
    const int gX = blockDim.x*blockIdx.x;

    /* The staging buffer is declared as 32-bit words so that the word-wide
       copy below is guaranteed to be aligned. A plain unsigned char array
       carries no 4-byte alignment guarantee. */
    __shared__ unsigned int sWords[(THREADS*3)/4];

    /* Copy this block's source bytes to shared memory four bytes at a time.
       No tail check is done: the caller pads d_src to a whole number of
       blocks. The word index is computed in size_t so that gX*3 cannot
       overflow int for large images. */
    if ( (x*4) < THREADS*3 ) {
        const unsigned int *srcWords = (const unsigned int *)d_src;
        sWords[x] = srcWords[(((size_t)gX*3)>>2) + x];
    }
    __syncthreads();

    /* Read the staged data back one byte at a time */
    const unsigned char *sData = (const unsigned char *)sWords;
    const int offset = x*3;
    const T r = (T)(sData[offset]);
    const T g = (T)(sData[offset+1]);
    const T b = (T)(sData[offset+2]);

    /* Threads that fall into the padding past the last pixel write nothing */
    const int globalOutputPosition = gX + x;
    if (globalOutputPosition < pixels) {
        storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
    }
}
/*
 * Copy 8-bit single-channel source data into one component buffer.
 *
 * Launch layout : 1D grid of 1D blocks. The kernel is written for blocks of
 *                 exactly THREADS threads; each block handles THREADS
 *                 consecutive pixels, i.e. THREADS source bytes.
 * Shared memory : THREADS bytes, statically allocated.
 * Preconditions : d_src must be 4-byte aligned and must hold
 *                 gridDim.x*THREADS readable bytes, because the staging copy
 *                 reads whole blocks without a tail check.
 *                 THREADS must be a multiple of 4.
 *
 * d_c    : output buffer, one element per pixel
 * d_src  : source bytes, one per pixel
 * pixels : number of valid pixels; output positions >= pixels are skipped
 */
template<typename T>
__global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
{
    const int x  = threadIdx.x;
    const int gX = blockDim.x*blockIdx.x;

    /* The staging buffer is declared as 32-bit words so that the word-wide
       copy below is guaranteed to be aligned. A plain unsigned char array
       carries no 4-byte alignment guarantee. */
    __shared__ unsigned int sWords[THREADS/4];

    /* Copy this block's source bytes to shared memory four bytes at a time.
       No tail check is done: the caller pads d_src to a whole number of
       blocks. */
    if ( (x*4) < THREADS) {
        const unsigned int *srcWords = (const unsigned int *)d_src;
        sWords[x] = srcWords[(gX>>2) + x];
    }
    __syncthreads();

    /* Read the staged data back one byte at a time */
    const unsigned char *sData = (const unsigned char *)sWords;
    const T c = (T)(sData[x]);

    /* Threads that fall into the padding past the last pixel write nothing */
    const int globalOutputPosition = gX + x;
    if (globalOutputPosition < pixels) {
        storeComponent(d_c, c, globalOutputPosition);
    }
}
/*
 * Separate the components of a packed 8-bit RGB image into three buffers.
 *
 * The image is copied from host memory into a temporary device buffer that
 * is zero-padded to a whole number of THREADS-pixel blocks, split by the
 * c_CopySrcToComponents kernel, and the temporary buffer is released again.
 *
 * d_r, d_g, d_b : output buffers written by the kernel; each must hold at
 *                 least width*height elements of type T
 * src           : host pointer to width*height*3 source bytes
 * width, height : image dimensions in pixels
 *
 * An image with no pixels is a no-op.
 */
template<typename T>
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height)
{
    unsigned char * d_src;
    int pixels = width*height;

    /* Nothing to convert. Launching a grid with zero blocks would be rejected
       as an invalid configuration, so return before touching the device. */
    if (pixels <= 0) {
        return;
    }

    /* Byte counts are kept in size_t: three bytes per pixel can exceed the
       range of int for large images. */
    size_t srcSize     = (size_t)pixels * 3;
    size_t alignedSize = (size_t)DIVANDRND(pixels, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
    assert(alignedSize%(THREADS*3) == 0);

    /* Alloc d_src buffer */
    cudaMalloc((void **)&d_src, alignedSize);
    cudaCheckAsyncError("Cuda malloc")

    /* Zero the whole buffer so the padding past the image holds defined data */
    cudaMemset(d_src, 0, alignedSize);
    cudaCheckError("Cuda memset")

    /* Copy data to device */
    cudaMemcpy(d_src, src, srcSize, cudaMemcpyHostToDevice);
    cudaCheckError("Copy data to device")

    /* Kernel: one block of THREADS threads per THREADS pixels */
    dim3 threads(THREADS);
    dim3 grid((unsigned int)(alignedSize/(THREADS*3)));
    c_CopySrcToComponents<<<grid, threads>>>(d_r, d_g, d_b, d_src, pixels);
    cudaCheckAsyncError("CopySrcToComponents kernel")

    /* Free Memory */
    cudaFree(d_src);
    cudaCheckAsyncError("Free memory")
}
template void rgbToComponents<float>(float *d_r, float *d_g, float *d_b, unsigned char * src, int width, int height);
template void rgbToComponents<int>(int *d_r, int *d_g, int *d_b, unsigned char * src, int width, int height);
/*
 * Copy an 8-bit single-channel image into a component buffer of type T.
 *
 * The image is copied from host memory into a temporary device buffer that
 * is zero-padded to a whole number of THREADS-pixel blocks, converted by the
 * c_CopySrcToComponent kernel, and the temporary buffer is released again.
 *
 * d_c           : output buffer written by the kernel; must hold at least
 *                 width*height elements of type T
 * src           : host pointer to width*height source bytes
 * width, height : image dimensions in pixels
 *
 * An image with no pixels is a no-op.
 */
template<typename T>
void bwToComponent(T *d_c, unsigned char * src, int width, int height)
{
    unsigned char * d_src;
    int pixels = width*height;

    /* Nothing to convert. Launching a grid with zero blocks would be rejected
       as an invalid configuration, so return before touching the device. */
    if (pixels <= 0) {
        return;
    }

    /* Byte counts are kept in size_t: rounding pixels up to a whole block
       can exceed the range of int for very large images. */
    size_t srcSize     = (size_t)pixels;
    size_t alignedSize = (size_t)DIVANDRND(pixels, THREADS) * THREADS; //aligned to thread block size -- THREADS
    assert(alignedSize%(THREADS) == 0);

    /* Alloc d_src buffer */
    cudaMalloc((void **)&d_src, alignedSize);
    cudaCheckAsyncError("Cuda malloc")

    /* Zero the whole buffer so the padding past the image holds defined data */
    cudaMemset(d_src, 0, alignedSize);
    cudaCheckError("Cuda memset")

    /* Copy data to device */
    cudaMemcpy(d_src, src, srcSize, cudaMemcpyHostToDevice);
    cudaCheckError("Copy data to device")

    /* Kernel: one block of THREADS threads per THREADS pixels */
    dim3 threads(THREADS);
    dim3 grid((unsigned int)(alignedSize/(THREADS)));
    c_CopySrcToComponent<<<grid, threads>>>(d_c, d_src, pixels);
    cudaCheckAsyncError("CopySrcToComponent kernel")

    /* Free Memory */
    cudaFree(d_src);
    cudaCheckAsyncError("Free memory")
}

template void bwToComponent<float>(float *d_c, unsigned char *src, int width, int height);
template void bwToComponent<int>(int *d_c, unsigned char *src, int width, int height);