光线追踪

    xiaoxiao2021-04-15  44

    书上讲常量内存的那章有个用光线追踪画球的东西。 暂时没用常量内存实现了一下。 不得已自己写了个mvec3结构体

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <iostream>
    #include <fstream>
    #include <cmath>
    #include <cstdio>
    #include <cstdlib>
    #include "myGL.h"
    using namespace std;

    // Size of the window and of the rendered image, in pixels.
    const GLuint WIDTH = 1280;
    const GLuint HEIGHT = 960;

    // Sentinel distances. Sphere::calc returns INF for "no visible hit"; the
    // kernel starts its nearest-hit search at SINF (< INF), so an INF result
    // can never be selected as the nearest sphere.
    __device__ const float INF = 123456.0f;
    __device__ const float SINF = 123455.0f;

    // Minimal 3-component float vector.
    struct mvec3 {
        float x, y, z;

        // __host__ as well as __device__: the host default-constructs Spheres
        // (and therefore mvec3) in ReadData via new Sphere[n].
        __host__ __device__ mvec3(float _x = 0, float _y = 0, float _z = 0) : x(_x), y(_y), z(_z) {}

        // Euclidean length.
        __device__ float length() const {
            return sqrtf(x * x + y * y + z * z);
        }
        // Squared Euclidean length.
        __device__ float square() const {
            return x * x + y * y + z * z;
        }
        __device__ mvec3 operator + (const mvec3 &t) const {
            return mvec3(x + t.x, y + t.y, z + t.z);
        }
        __device__ mvec3 operator - (const mvec3 &t) const {
            return mvec3(x - t.x, y - t.y, z - t.z);
        }
        __device__ mvec3 operator * (float t) const {
            return mvec3(x * t, y * t, z * t);
        }
        __device__ mvec3 operator / (float t) const {
            return mvec3(x / t, y / t, z / t);
        }
        __device__ friend float dot(const mvec3 &a, const mvec3 &b) {
            return a.x * b.x + a.y * b.y + a.z * b.z;
        }
        __device__ friend mvec3 cross(const mvec3 &a, const mvec3 &b) {
            return mvec3(
                a.y * b.z - a.z * b.y,
                a.z * b.x - a.x * b.z,
                a.x * b.y - a.y * b.x
            );
        }
        // Returns a / |a|; the caller must not pass a zero-length vector.
        __device__ friend mvec3 normalize(const mvec3 &a) {
            return a / a.length();
        }
    };

    // A sphere with centre `site`, radius `r` and a base colour.
    struct Sphere {
        mvec3 site;
        float r;
        unsigned char rgb[4]; // only rgb is used; rgb[3] is always 0

        // Reads "x y z r R G B" from the stream (colour channels as integers).
        void readData(ifstream &is) {
            is >> site.x >> site.y >> site.z >> r;
            int s;
            for (int i = 0; i < 3; ++i) {
                is >> s;
                rgb[i] = (unsigned char)s;
            }
            rgb[3] = 0;
        }

        // Distance along `ray` (a unit vector starting at the origin) to the
        // nearest intersection with this sphere, or INF when there is none in
        // front of the origin.
        __device__ float calc(const mvec3 &ray) const {
            float g = dot(ray, site);                 // signed distance to the closest approach
            float lh = (site - ray * g).square();     // squared ray-to-centre distance
            if (lh >= r * r) return INF;              // the ray misses the sphere
            // Use the signed g rather than sqrt(|site|^2 - lh) == |g|, so that
            // spheres behind the origin are not reported as hits.
            float t = g - sqrtf(r * r - lh);
            return t > 0.0f ? t : INF;
        }

        // Cosine of the angle between `ray` and the unit vector pointing from
        // the hit point (ray * dis) towards the sphere centre.
        __device__ float calcCos(float dis, const mvec3 &ray) const {
            mvec3 rToC = normalize(site - ray * dis);
            return dot(rToC, ray);
        }
    };

    void HANDLE_ERROR(cudaError_t status);
    void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height);

    // Loads the scene from "sphere.in": a sphere count followed by that many
    // sphere records. On failure an error is printed, n is set to 0 and res to
    // NULL.
    void ReadData(Sphere* &res, int &n) {
        res = NULL;
        n = 0;
        ifstream is("sphere.in");
        if (!is || !(is >> n) || n < 0) {
            cerr << "ERROR: cannot read the sphere count from sphere.in" << endl;
            n = 0;
            return;
        }
        cout << "Get " << n << " spheres." << endl;
        res = new Sphere[n];
        for (int i = 0; i < n; ++i)
            res[i].readData(is);
        if (!is)
            cerr << "ERROR: sphere.in ended before all spheres were read" << endl;
    }

    int main() {
        Sphere *sp;
        int spnum;
        ReadData(sp, spnum);

        // Zero-initialised so a failed render shows black instead of garbage.
        unsigned char *p = new unsigned char[WIDTH * HEIGHT * 4]();

        GLFWwindow *window = glfwStart(WIDTH, HEIGHT, "ray-tracing");
        Shader shader;
        shader.mkShader("shader.vert", NULL, "shader.frag");
        GLuint vao = mkVAO();

        // Render once on the GPU, then display the result as a texture.
        drawPixels(p, sp, spnum, WIDTH, HEIGHT);
        GLuint tex = mkTex(GL_RGBA, WIDTH, HEIGHT, p);

        while (!glfwWindowShouldClose(window)) {
            glfwPollEvents();
            glClearColor(0, 0, 0, 0);
            glClear(GL_COLOR_BUFFER_BIT);
            shader.Use();
            glBindTexture(GL_TEXTURE_2D, tex);
            glBindVertexArray(vao);
            glDrawArrays(GL_TRIANGLES, 0, 6);
            glfwSwapBuffers(window);
            GLuint err = glGetError();
            if (err)
                cout << "Error: " << err << endl;
        }

        glDeleteTextures(1, &tex);
        delete[] p;
        delete[] sp;
        glfwTerminate();
        return 0;
    }

    // Terminates the program with a failure exit code if `status` is an error.
    void HANDLE_ERROR(cudaError_t status) {
        if (status != cudaSuccess) {
            fprintf(stderr, "Error~ %s\n", cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    // Ray-traces one pixel per thread.
    // Launch layout: 2D grid of 2D blocks covering at least width x height
    // threads; threads outside the image return without writing.
    // res: device buffer of width * height * 4 bytes (RGBA, row-major).
    // sp:  device array of spnum spheres (not dereferenced when spnum == 0).
    __global__ void kernel(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        // Guard both axes: a check on the flat offset alone lets x >= width
        // wrap into the next row and lets tail threads write out of range.
        if (x >= width || y >= height) return;

        int offset = (y * width + x) * 4;

        // Ray from the origin through this pixel of an image plane at z = 1.
        float fx = 1.0f * (width / 2 - x) / height;
        float fy = 1.0f * y / height - 0.5f;
        mvec3 ray = normalize(mvec3(fx, fy, 1));

        // Find the nearest sphere hit by the ray.
        float miniLen = SINF;
        int miniNum = -1;
        for (int i = 0; i < spnum; ++i) {
            float gg = sp[i].calc(ray);
            if (gg < miniLen) {
                miniLen = gg;
                miniNum = i;
            }
        }

        if (miniNum == -1) {
            res[offset] = res[offset + 1] = res[offset + 2] = 0;
        } else {
            const unsigned char *rgb = sp[miniNum].rgb;
            // Scale the base colour by the cosine returned by calcCos.
            float light = sp[miniNum].calcCos(miniLen, ray);
            res[offset] = light * rgb[0];
            res[offset + 1] = light * rgb[1];
            res[offset + 2] = light * rgb[2];
        }
        res[offset + 3] = 0;
    }

    // Prints "ERROR: <what>: <cuda error string>" and returns false when
    // `status` is an error; returns true otherwise.
    static bool checkCuda(cudaError_t status, const char *what) {
        if (status == cudaSuccess) return true;
        fprintf(stderr, "ERROR: %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }

    // Renders the scene into `res` (host buffer of width * height * 4 bytes).
    // sp / spnum describe the host-side sphere array. Failures are reported on
    // stderr; device memory is released on every path.
    void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
        HANDLE_ERROR(cudaSetDevice(0));

        unsigned char *p = 0;   // device pixel buffer
        Sphere *pp = 0;         // device copy of the spheres
        const size_t pixelBytes = static_cast<size_t>(width) * height * 4;
        const size_t sphereBytes = static_cast<size_t>(spnum > 0 ? spnum : 0) * sizeof(Sphere);

        bool ok = true;
        if (sphereBytes > 0) {
            ok = checkCuda(cudaMalloc((void**)&pp, sphereBytes), "Malloc for Sphere failed")
              && checkCuda(cudaMemcpy(pp, sp, sphereBytes, cudaMemcpyHostToDevice), "Memcpy for Sphere failed");
        }
        ok = ok && checkCuda(cudaMalloc((void**)&p, pixelBytes), "Malloc for pixels failed");

        if (ok) {
            // 16x16 = 256 threads per block; ceil-div so the grid covers the
            // whole image even when the size is not a multiple of the tile.
            const int TILE = 16;
            dim3 block(TILE, TILE);
            dim3 grid((width + TILE - 1) / TILE, (height + TILE - 1) / TILE);
            kernel<<<grid, block>>>(p, pp, spnum > 0 ? spnum : 0, width, height);

            ok = checkCuda(cudaGetLastError(), "Build kernel failed")
              && checkCuda(cudaDeviceSynchronize(), "kernel run failed")
              && checkCuda(cudaMemcpy(res, p, pixelBytes, cudaMemcpyDeviceToHost), "Memcpy failed");
        }

        // cudaFree accepts null pointers, so this is safe on every path.
        cudaFree(p);
        cudaFree(pp);
        HANDLE_ERROR(cudaDeviceReset());
    }

    sphere.in文件

    2 0 0 10 3 255 0 0 3 3 10 3 0 255 0

    也就是站在(0,0,0)点向z轴正方向看。效果图:

    换成用常量内存存储Spheres,然后把球的数目增加到200,渲染2560*1920的图。实测无论用常量内存还是全局内存,速度几乎一样,都是1170ms。倒是如果在每一个线程块里用__shared__复制一遍Spheres数组,能够将时间缩减到1080ms,差不多9%的性能提升。于是用__constant__存储球,再加上一个球与光线的判定优化,勉强达到了970ms。最后还是达不到书上所说的近50%的性能提升,这个日后再细究。。 以及:感觉我的代码内存泄漏有点严重Orz

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <iostream>
    #include <fstream>
    #include <cmath>
    #include <cstdio>
    #include <cstdlib>
    #include "myGL.h"
    using namespace std;

    // Size of the window in pixels; the image is rendered at twice this size.
    const GLuint WIDTH = 1280;
    const GLuint HEIGHT = 960;

    // Capacity of the constant-memory sphere table.
    const int MAX_SPHERES = 200;

    // Sentinel distances. Sphere::calc returns INF for "no visible hit"; the
    // kernel starts its nearest-hit search at SINF (< INF), so an INF result
    // can never be selected as the nearest sphere.
    __device__ const float INF = 123456.0f;
    __device__ const float SINF = 123455.0f;

    // Factory used instead of a constructor, so mvec3 (and Sphere) have no
    // user-provided constructor and can be stored in a __constant__ array.
    // Declared at namespace scope: an in-class friend taking only floats is
    // not visible to ordinary lookup and cannot be found by ADL.
    struct mvec3;
    __host__ __device__ inline mvec3 Mvec3(float _x = 0, float _y = 0, float _z = 0);

    // Minimal 3-component float vector.
    struct mvec3 {
        float x, y, z;

        // Euclidean length.
        __device__ float length() const {
            return sqrtf(x * x + y * y + z * z);
        }
        // Squared Euclidean length.
        __device__ float square() const {
            return x * x + y * y + z * z;
        }
        __device__ mvec3 operator + (const mvec3 &t) const {
            return Mvec3(x + t.x, y + t.y, z + t.z);
        }
        __device__ mvec3 operator - (const mvec3 &t) const {
            return Mvec3(x - t.x, y - t.y, z - t.z);
        }
        __device__ mvec3 operator * (const float &t) const {
            return Mvec3(x * t, y * t, z * t);
        }
        __device__ mvec3 operator / (const float &t) const {
            return Mvec3(x / t, y / t, z / t);
        }
        __device__ friend float dot(const mvec3 &a, const mvec3 &b) {
            return a.x * b.x + a.y * b.y + a.z * b.z;
        }
        __device__ friend mvec3 cross(const mvec3 &a, const mvec3 &b) {
            return Mvec3(
                a.y * b.z - a.z * b.y,
                a.z * b.x - a.x * b.z,
                a.x * b.y - a.y * b.x
            );
        }
        // Returns a / |a|; the caller must not pass a zero-length vector.
        __device__ friend mvec3 normalize(const mvec3 &a) {
            return a / a.length();
        }
    };

    __host__ __device__ inline mvec3 Mvec3(float _x, float _y, float _z) {
        mvec3 res;
        res.x = _x;
        res.y = _y;
        res.z = _z;
        return res;
    }

    // A sphere with centre `site`, radius `r` and a base colour.
    struct Sphere {
        mvec3 site;
        float r;
        unsigned char rgb[4]; // only rgb is used; rgb[3] is always 0

        // Reads "x y z r R G B" from the stream (colour channels as integers).
        void readData(ifstream &is) {
            is >> site.x >> site.y >> site.z >> r;
            int s;
            for (int i = 0; i < 3; ++i) {
                is >> s;
                rgb[i] = (unsigned char)s;
            }
            rgb[3] = 0;
        }

        // Distance along `ray` (a unit vector starting at the origin) to the
        // nearest intersection with this sphere, or INF when there is none in
        // front of the origin or it cannot be closer than `nowLen`.
        __device__ float calc(const mvec3 &ray, const float &nowLen) const {
            // Early out: no point of the sphere is closer than |site| - r.
            // |site| - r > nowLen is tested as |site|^2 > (nowLen + r)^2 to
            // avoid a sqrt; comparing |site|^2 - r with nowLen mixes a squared
            // distance with a length and culls spheres that are nearer.
            float reach = nowLen + r;
            if (site.square() > reach * reach) return INF;

            float g = dot(ray, site);                 // signed distance to the closest approach
            float lh = (site - ray * g).square();     // squared ray-to-centre distance
            if (lh >= r * r) return INF;              // the ray misses the sphere
            // Use the signed g rather than sqrt(|site|^2 - lh) == |g|, so that
            // spheres behind the origin are not reported as hits.
            float t = g - sqrtf(r * r - lh);
            return t > 0.0f ? t : INF;
        }

        // Cosine of the angle between `ray` and the unit vector pointing from
        // the hit point (ray * dis) towards the sphere centre.
        __device__ float calcCos(const float &dis, const mvec3 &ray) const {
            mvec3 rToC = normalize(site - ray * dis);
            return dot(rToC, ray);
        }
    };

    // Scene spheres in constant memory; filled by drawPixels.
    __constant__ Sphere pp[MAX_SPHERES];

    void HANDLE_ERROR(cudaError_t status);
    void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height);

    // Loads the scene from "sphere.in": a sphere count followed by that many
    // sphere records. On failure an error is printed, n is set to 0 and res to
    // NULL.
    void ReadData(Sphere* &res, int &n) {
        res = NULL;
        n = 0;
        ifstream is("sphere.in");
        if (!is || !(is >> n) || n < 0) {
            cerr << "ERROR: cannot read the sphere count from sphere.in" << endl;
            n = 0;
            return;
        }
        cout << "Get " << n << " spheres." << endl;
        res = new Sphere[n];
        for (int i = 0; i < n; ++i)
            res[i].readData(is);
        if (!is)
            cerr << "ERROR: sphere.in ended before all spheres were read" << endl;
    }

    int main() {
        Sphere *sp;
        int spnum;
        ReadData(sp, spnum);

        // The image is rendered at (2*WIDTH) x (2*HEIGHT), 4 bytes per pixel.
        // Zero-initialised so a failed render shows black instead of garbage.
        unsigned char *p = new unsigned char[WIDTH * HEIGHT * 4 * 4]();

        GLFWwindow *window = glfwStart(WIDTH, HEIGHT, "ray-tracing");
        Shader shader;
        shader.mkShader("shader.vert", NULL, "shader.frag");
        GLuint vao = mkVAO();

        // Render once on the GPU, then display the result as a texture.
        drawPixels(p, sp, spnum, WIDTH * 2, HEIGHT * 2);
        GLuint tex = mkTex(GL_RGBA, WIDTH * 2, HEIGHT * 2, p);

        while (!glfwWindowShouldClose(window)) {
            glfwPollEvents();
            glClearColor(0, 0, 0, 0);
            glClear(GL_COLOR_BUFFER_BIT);
            shader.Use();
            glBindTexture(GL_TEXTURE_2D, tex);
            glBindVertexArray(vao);
            glDrawArrays(GL_TRIANGLES, 0, 6);
            glfwSwapBuffers(window);
            GLuint err = glGetError();
            if (err)
                cout << "Error: " << err << endl;
        }

        glDeleteTextures(1, &tex);
        delete[] p;
        delete[] sp;
        glfwTerminate();
        return 0;
    }

    // Terminates the program with a failure exit code if `status` is an error.
    void HANDLE_ERROR(cudaError_t status) {
        if (status != cudaSuccess) {
            fprintf(stderr, "Error~ %s\n", cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    // Ray-traces one pixel per thread, reading the spheres from the
    // constant-memory table `pp`.
    // Launch layout: 2D grid of 2D blocks covering at least width x height
    // threads; threads outside the image return without writing.
    // res:   device buffer of width * height * 4 bytes (RGBA, row-major).
    // spnum: number of valid entries in pp (at most MAX_SPHERES).
    __global__ void kernel(unsigned char *res, int spnum, int width, int height) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        // Guard both axes: a check on the flat offset alone lets x >= width
        // wrap into the next row and lets tail threads write out of range.
        if (x >= width || y >= height) return;

        int offset = (y * width + x) * 4;

        // Ray from the origin through this pixel of an image plane at z = 1.
        float fx = 1.0f * (width / 2 - x) / height;
        float fy = 1.0f * y / height - 0.5f;
        mvec3 ray = normalize(Mvec3(fx, fy, 1));

        // Find the nearest sphere hit by the ray; the current best distance is
        // passed to calc so it can reject spheres that are too far away.
        float miniLen = SINF;
        int miniNum = -1;
        for (int i = 0; i < spnum; ++i) {
            float gg = pp[i].calc(ray, miniLen);
            if (gg < miniLen) {
                miniLen = gg;
                miniNum = i;
            }
        }

        if (miniNum == -1) {
            res[offset] = res[offset + 1] = res[offset + 2] = 0;
        } else {
            const unsigned char *rgb = pp[miniNum].rgb;
            // Scale the base colour by the cosine returned by calcCos.
            float light = pp[miniNum].calcCos(miniLen, ray);
            res[offset] = light * rgb[0];
            res[offset + 1] = light * rgb[1];
            res[offset + 2] = light * rgb[2];
        }
        res[offset + 3] = 0;
    }

    // Prints "ERROR: <what>: <cuda error string>" and returns false when
    // `status` is an error; returns true otherwise.
    static bool checkCuda(cudaError_t status, const char *what) {
        if (status == cudaSuccess) return true;
        fprintf(stderr, "ERROR: %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }

    // Renders the scene into `res` (host buffer of width * height * 4 bytes).
    // sp / spnum describe the host-side sphere array, which must hold at most
    // MAX_SPHERES spheres. Failures are reported on stderr; device memory is
    // released on every path.
    void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
        HANDLE_ERROR(cudaSetDevice(0));

        unsigned char *p = 0;   // device pixel buffer
        const size_t pixelBytes = static_cast<size_t>(width) * height * 4;
        if (spnum < 0) spnum = 0;

        bool ok = true;
        if (spnum > MAX_SPHERES) {
            fprintf(stderr, "ERROR: %d spheres exceed the constant-memory capacity of %d.\n",
                    spnum, MAX_SPHERES);
            ok = false;
        }
        ok = ok && checkCuda(cudaMalloc((void**)&p, pixelBytes), "Malloc for pixels failed");
        if (ok && spnum > 0) {
            ok = checkCuda(cudaMemcpyToSymbol(pp, sp, spnum * sizeof(Sphere)), "MemcpyToSymbol failed");
        }

        if (ok) {
            // 16x16 = 256 threads per block; ceil-div so the grid covers the
            // whole image even when the size is not a multiple of the tile.
            const int TILE = 16;
            dim3 block(TILE, TILE);
            dim3 grid((width + TILE - 1) / TILE, (height + TILE - 1) / TILE);
            kernel<<<grid, block>>>(p, spnum, width, height);

            ok = checkCuda(cudaGetLastError(), "Build kernel failed")
              && checkCuda(cudaDeviceSynchronize(), "kernel run failed")
              && checkCuda(cudaMemcpy(res, p, pixelBytes, cudaMemcpyDeviceToHost), "Memcpy failed");
        }

        // Only the pixel buffer came from cudaMalloc; the __constant__ table
        // pp is a static symbol and must not be passed to cudaFree.
        cudaFree(p);
        HANDLE_ERROR(cudaDeviceReset());
    }

    效果图:

    转载请注明原文地址: https://ju.6miu.com/read-671179.html

    最新回复(0)