/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* This sample program models formation of V-shaped flocks by big birds, * such as geese and cranes, as an example of simple AI. It demonstrates * that the CUDA-based implementation is much faster than a CPU-based one. */ #pragma warning(disable : 4312) #include #include #pragma warning(disable : 4996) // disable deprecated warning #include #pragma warning(default : 4996) #include #include #include #include #include #include // This header includes all the necessary D3D10 includes #include #include #include // includes, project #include #include #include #include "VFlockingD3D10.h" #define MAX_EPSILON 10 static char *SDK_name = "VFlockingD3D10"; bool g_bPassed = true; int g_iFrameToCompare = 1300; int *pArgc = NULL; char **pArgv = NULL; //----------------------------------------------------------------------------- // Global variables //----------------------------------------------------------------------------- IDXGIAdapter *g_pCudaCapableAdapter = NULL; // Adapter to use ID3D10Device *g_pd3dDevice = NULL; // Our rendering device IDXGISwapChain *g_pSwapChain = NULL; // The swap chain of the window ID3D10RenderTargetView *g_pSwapChainRTV = NULL; // The Render target view on the swap chain ( used for clear) ID3D10RasterizerState *g_pRasterState = NULL; ID3D10Buffer *g_pPositions = NULL; cudaGraphicsResource *g_pCudaResourcePos; ID3D10Buffer *g_pNewPositions = NULL; cudaGraphicsResource *g_pCudaResourceNewPos; ID3D10ShaderResourceView *g_pPositionsSRV = NULL; ID3D10InputLayout *g_pInputLayout = NULL; ID3D10Effect *g_pSimpleEffect = NULL; ID3D10EffectTechnique *g_pDrawQuadTechnique = NULL; ID3D10EffectTechnique *g_pDrawBirdsTechnique = NULL; ID3D10EffectVectorVariable *g_pvQuadRect = NULL; ID3D10EffectShaderResourceVariable *g_pTexture2D = NULL; static const char g_simpleEffectSrc[] = "Buffer g_BirdsPositions : register(t0); \n" "float4 g_vQuadRect; \n" "Texture2D g_Texture2D; \n" "\n" "SamplerState samLinear{ \n" " Filter = MIN_MAG_LINEAR_MIP_POINT; \n" "};\n" "\n" "struct Fragment{ \n" " float4 Pos : SV_POSITION;\n" " float3 Tex : TEXCOORD0; };\n" "\n" "Fragment VS( uint vertexId : SV_VertexID )\n" "{\n" " Fragment f;\n" " f.Tex = float3( 0.f, 0.f, 0.f); \n" " if (vertexId == 1) f.Tex.x = 1.f; \n" " else if (vertexId == 2) f.Tex.y = 1.f; \n" " else if (vertexId == 3) f.Tex.xy = float2(1.f, 1.f); \n" " \n" " f.Pos = float4( g_vQuadRect.xy + f.Tex * g_vQuadRect.zw, 0, 1);\n" " \n" " return f;\n" "}\n" "\n" "float4 PS( Fragment f ) : SV_Target\n" "{\n" " return g_Texture2D.Sample( samLinear, f.Tex.xy ); \n" " // return float4(f.Tex, 1);\n" "}\n" "\n" "float4 VSBird( uint VertexID : SV_VertexID ) : SV_Position \n" "{ \n" " float4 position = float4( 0, 0, 0.5, 1 ) ; \n" " \n" " int bn = VertexID / 3 ; \n" " int vn = VertexID % 3 ; \n" " \n" " float2 birdcenter = 0.0014 * g_BirdsPositions.Load(bn) - float2(-0.15, " "0.15) ; \n" " \n" " float wing = 0.12 ; \n" " switch(vn) \n" " { \n" " case 0 : \n" " position.x = birdcenter.x - wing ; \n" " position.y = birdcenter.y - 0.01 ; \n" " break ; \n" " case 1 : \n" " position.x = birdcenter.x + wing ; \n" " position.y = birdcenter.y - 0.01 ; \n" " break ; \n" " case 2 : \n" " position.x = birdcenter.x ; \n" " position.y = birdcenter.y + 0.005 ; \n" " break ; \n" " } \n" " \n" " position.z = 0.5; \n" " position.w = 1.0; \n" " \n" " return position ; \n" "} \n" " \n" "float4 PSBird( float4 input : SV_Position ) : SV_Target \n" "{ \n" " return float4( 1, 1, 1, 1 ); \n" "} \n" "RasterizerState NoCull \n" "{ \n" " CullMode = None; \n" "}; \n" "BlendState Opaque \n" "{ \n" " BlendEnable[0] = false; \n" "}; \n" " \n" "DepthStencilState DisableDepthTestWrite \n" "{ \n" " DepthEnable = FALSE; \n" " DepthWriteMask = 0; \n" "}; \n" " \n" "technique10 DrawBirds \n" "{ \n" " pass P0 \n" " { \n" " SetVertexShader( CompileShader( vs_4_0, VSBird() ) ); \n" " SetGeometryShader( NULL ); \n" " SetPixelShader( CompileShader( ps_4_0, PSBird() ) ); \n" " \n" " SetDepthStencilState( DisableDepthTestWrite, 1 ); \n" " SetBlendState(Opaque,float4(0,0,0,0),0xffffffff); \n" " SetRasterizerState(NoCull); \n" " } \n" "} \n" " \n" "technique10 Render\n" "{\n" " pass P0\n" " {\n" " SetVertexShader( CompileShader( vs_4_0, VS() ) );\n" " SetGeometryShader( NULL );\n" " SetPixelShader( CompileShader( ps_4_0, PS() ) );\n" " }\n" "}\n" "\n"; // testing/tracing function used pervasively in tests. If the condition is // unsatisfied // then spew and fail the function immediately (doing no cleanup) #define AssertOrQuit(x) \ if (!(x)) { \ fprintf(stdout, "Assert unsatisfied in %s at %s:%d\n", __FUNCTION__, \ __FILE__, __LINE__); \ return 1; \ } #ifndef SAFE_RELEASE #define SAFE_RELEASE(p) \ { \ if (p) { \ (p)->Release(); \ (p) = NULL; \ } \ } #endif bool g_runCPU = false; bool g_bDone = false; int g_seed = 126832513; // 1247227967 typedef unsigned int uint; const uint g_WindowWidth = 784; const uint g_WindowHeight = 784; // simulation parameters float alpha = 90.f; float upwashX = 30.f; float upwashY = 50.f; float wingspan = 50.f; float dX = .5f; float dY = .5f; float epsilon = 30.f; float lambda = -0.1073f * wingspan; // number of birds const uint nBirds = 25; // positions on host float2 *positions = NULL; float2 *new_positions = NULL; struct WingTip { float x; // x coordinate float y; // y coordinate int lr; // 1 if left, -1 if right }; struct Gap { float2 left; // left bordering point float2 right; // right bordering point }; struct ViewGoal { float2 pos; // x coordinate of a bird's goal when pursuing unobstructed view float dist; // distance }; WingTip *g_wingTips = NULL; uint2 *pairs = NULL; uint2 *d_pairs = NULL; uint3 *triples = NULL; uint3 *d_triples = NULL; bool *hasproxy = NULL; bool *d_hasproxy = NULL; bool *d_neighbors = NULL; bool *leftgoals = NULL; bool *d_leftgoals = NULL; bool *rightgoals = NULL; bool *d_rightgoals = NULL; Params *params = NULL; Params *d_params = NULL; // The CUDA kernel launchers that get called extern "C" void cuda_simulate(float2 *newPos, float2 *curPos, uint numBirds, bool *d_hasproxy, bool *d_neighbors, bool *d_leftgoals, bool *d_rightgoals, uint2 *d_pairs, uint3 *d_triples, Params *m_params); //----------------------------------------------------------------------------- // Forward declarations //----------------------------------------------------------------------------- HRESULT InitD3D(HWND hWnd); void DrawScene(); void Cleanup(); void Render(); float2 diff(float2 pos0, float2 pos1); float norm(float2 pos); float dist(float2 pos0, float2 pos1); bool isInsideQuad(float2 pos0, float2 pos1, float width, float height); void initialize(uint numBirds); void simulate(float2 *newPos, float2 *curPos, uint numBirds); LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam); bool findCUDADevice() { int nGraphicsGPU = 0; int deviceCount = 0; bool bFoundGraphics = false; char devname[256]; // This function call returns 0 if there are no CUDA capable devices. cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); exit(EXIT_FAILURE); } if (deviceCount == 0) { printf("> There are no device(s) supporting CUDA\n"); return false; } else { printf("> Found %d CUDA Capable Device(s)\n", deviceCount); } // Get CUDA device properties cudaDeviceProp deviceProp; for (int dev = 0; dev < deviceCount; ++dev) { cudaGetDeviceProperties(&deviceProp, dev); strcpy(devname, deviceProp.name); printf("> GPU %d: %s\n", dev, devname); } return true; } bool findDXDevice(char *dev_name) { HRESULT hr = S_OK; cudaError cuStatus; // Iterate through the candidate adapters IDXGIFactory *pFactory; hr = sFnPtr_CreateDXGIFactory(__uuidof(IDXGIFactory), (void **)(&pFactory)); if (!SUCCEEDED(hr)) { printf("> No DXGI Factory created.\n"); return false; } UINT adapter = 0; for (; !g_pCudaCapableAdapter; ++adapter) { // Get a candidate DXGI adapter IDXGIAdapter *pAdapter = NULL; hr = pFactory->EnumAdapters(adapter, &pAdapter); if (FAILED(hr)) { break; // no compatible adapters found } // Query to see if there exists a corresponding compute device int cuDevice; cuStatus = cudaD3D10GetDevice(&cuDevice, pAdapter); // This prints and resets the cudaError to cudaSuccess printLastCudaError("cudaD3D10GetDevice failed"); if (cudaSuccess == cuStatus) { // If so, mark it as the one against which to create our d3d10 device g_pCudaCapableAdapter = pAdapter; g_pCudaCapableAdapter->AddRef(); } pAdapter->Release(); } printf("> Found %d D3D10 Adapter(s).\n", (int)adapter); pFactory->Release(); if (!g_pCudaCapableAdapter) { printf("> Found 0 D3D10 Adapter(s) /w Compute capability.\n"); return false; } DXGI_ADAPTER_DESC adapterDesc; g_pCudaCapableAdapter->GetDesc(&adapterDesc); wcstombs(dev_name, adapterDesc.Description, 128); printf("> Found 1 D3D10 Adapter(s) /w Compute capability.\n"); printf("> %s\n", dev_name); return true; } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { char device_name[256]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", SDK_name); printf( "NOTE: The CUDA Samples are not meant for performance measurements. " "Results may vary when GPU Boost is enabled.\n\n"); if (!findCUDADevice()) // Search for CUDA GPU { printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name); exit(EXIT_SUCCESS); } if (!dynlinkLoadD3D10API()) // Search for D3D API (locate drivers, does not // mean device is found) { printf("> D3D10 API libraries NOT found on.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (!findDXDevice(device_name)) // Search for D3D Hardware Device { printf("> D3D10 Graphics Device NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } // command line options if (argc > 1) { // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } } // // create window // // Register the window class WNDCLASSEX wc = {sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L, GetModuleHandle(NULL), NULL, NULL, NULL, NULL, "CUDA SDK", NULL}; RegisterClassEx(&wc); // Create the application's window HWND hWnd = CreateWindow( wc.lpszClassName, "VFlocking", WS_OVERLAPPEDWINDOW, 0, 0, g_WindowWidth, g_WindowHeight, GetDesktopWindow() /*NULL*/, NULL, wc.hInstance, NULL); ShowWindow(hWnd, SW_SHOWDEFAULT); UpdateWindow(hWnd); // Initialize Direct3D if (SUCCEEDED(InitD3D(hWnd))) { { // register the Direct3D resources that we'll use // we'll read to and write from g_texture_2d, so don't set any special map // flags for it cudaError_t error = cudaSuccess; error = cudaGraphicsD3D10RegisterResource( &g_pCudaResourcePos, g_pPositions, cudaGraphicsRegisterFlagsNone); getLastCudaError( "cudaGraphicsD3D10RegisterResource (g_texture_2d) failed"); error = cudaGraphicsResourceSetMapFlags(g_pCudaResourcePos, cudaD3D10MapFlagsWriteDiscard); getLastCudaError("cudaGraphicsResourceSetMapFlags (g_texture_2d) failed"); cudaGraphicsD3D10RegisterResource(&g_pCudaResourceNewPos, g_pNewPositions, cudaGraphicsRegisterFlagsNone); getLastCudaError( "cudaGraphicsD3D10RegisterResource (g_texture_2d) failed"); error = cudaGraphicsResourceSetMapFlags(g_pCudaResourceNewPos, cudaD3D10MapFlagsWriteDiscard); getLastCudaError("cudaGraphicsResourceSetMapFlags (g_texture_2d) failed"); } } srand(g_seed); // allocate device memory for positions checkCudaErrors( cudaMalloc((void **)&d_pairs, nBirds * (nBirds - 1) * sizeof(uint2) / 2)); checkCudaErrors( cudaMalloc((void **)&d_triples, nBirds * (nBirds - 1) * (nBirds - 2) * sizeof(uint3) / 6)); checkCudaErrors( cudaMalloc((void **)&d_neighbors, nBirds * nBirds * sizeof(bool))); checkCudaErrors( cudaMalloc((void **)&d_leftgoals, nBirds * nBirds * sizeof(bool))); checkCudaErrors( cudaMalloc((void **)&d_rightgoals, nBirds * nBirds * sizeof(bool))); checkCudaErrors(cudaMalloc((void **)&d_hasproxy, nBirds * sizeof(bool))); checkCudaErrors(cudaMalloc((void **)&d_params, sizeof(Params))); initialize(nBirds); g_pd3dDevice->UpdateSubresource(g_pPositions, 0, NULL, positions, 0, 0); // // the main loop // while (false == g_bDone) { Render(); // // handle I/O // MSG msg; ZeroMemory(&msg, sizeof(msg)); while (msg.message != WM_QUIT) { if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) { TranslateMessage(&msg); DispatchMessage(&msg); } else { Render(); if (ref_file) { for (int count = 0; count < g_iFrameToCompare; count++) { Render(); } const char *cur_image_path = "VFlockingD3D10.ppm"; // Save a reference of our current test run image CheckRenderD3D10::ActiveRenderTargetToPPM(g_pd3dDevice, cur_image_path); // compare to official reference image, printing PASS or FAIL. g_bPassed = CheckRenderD3D10::PPMvsPPM(cur_image_path, ref_file, argv[0], MAX_EPSILON, 0.15f); g_bDone = true; PostQuitMessage(0); } else { g_bPassed = true; } } } }; // Unregister windows class UnregisterClass(wc.lpszClassName, wc.hInstance); // clean delete[] positions; delete[] new_positions; delete[] g_wingTips; // // and exit // printf("> %s running on %s exiting...\n", SDK_name, device_name); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); } //----------------------------------------------------------------------------- // Name: InitD3D() // Desc: Initializes Direct3D //----------------------------------------------------------------------------- HRESULT InitD3D(HWND hWnd) { // Set up the structure used to create the device and swapchain DXGI_SWAP_CHAIN_DESC sd; ZeroMemory(&sd, sizeof(sd)); sd.BufferCount = 1; sd.BufferDesc.Width = g_WindowWidth; sd.BufferDesc.Height = g_WindowHeight; sd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; sd.BufferDesc.RefreshRate.Numerator = 60; sd.BufferDesc.RefreshRate.Denominator = 1; sd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; sd.OutputWindow = hWnd; sd.SampleDesc.Count = 1; sd.SampleDesc.Quality = 0; sd.Windowed = TRUE; // Create device and swapchain HRESULT hr = sFnPtr_D3D10CreateDeviceAndSwapChain( g_pCudaCapableAdapter, D3D10_DRIVER_TYPE_HARDWARE, NULL, 0, D3D10_SDK_VERSION, &sd, &g_pSwapChain, &g_pd3dDevice); AssertOrQuit(SUCCEEDED(hr)); g_pCudaCapableAdapter->Release(); // birds' buffer D3D10_BUFFER_DESC bdesc; memset(&bdesc, 0, sizeof(bdesc)); bdesc.Usage = D3D10_USAGE_DEFAULT; bdesc.ByteWidth = nBirds * sizeof(float2); // sizeof(D3DXVECTOR2) ; // bdesc.BindFlags = D3D10_BIND_SHADER_RESOURCE; bdesc.CPUAccessFlags = 0; bdesc.MiscFlags = 0; g_pd3dDevice->CreateBuffer(&bdesc, NULL, &g_pPositions); g_pd3dDevice->CreateBuffer(&bdesc, NULL, &g_pNewPositions); D3D10_SHADER_RESOURCE_VIEW_DESC rsvdesc; memset(&rsvdesc, 0, sizeof(rsvdesc)); rsvdesc.Buffer.ElementOffset = 0; rsvdesc.Buffer.ElementWidth = nBirds; rsvdesc.Format = DXGI_FORMAT_R32G32_FLOAT; rsvdesc.ViewDimension = D3D10_SRV_DIMENSION_BUFFER; g_pd3dDevice->CreateShaderResourceView(g_pPositions, &rsvdesc, &g_pPositionsSRV); // Create a render target view of the swapchain ID3D10Texture2D *pBuffer; hr = g_pSwapChain->GetBuffer(0, __uuidof(ID3D10Texture2D), (LPVOID *)&pBuffer); AssertOrQuit(SUCCEEDED(hr)); hr = g_pd3dDevice->CreateRenderTargetView(pBuffer, NULL, &g_pSwapChainRTV); AssertOrQuit(SUCCEEDED(hr)); pBuffer->Release(); g_pd3dDevice->OMSetRenderTargets(1, &g_pSwapChainRTV, NULL); // Setup the viewport D3D10_VIEWPORT vp; vp.Width = g_WindowWidth; vp.Height = g_WindowHeight; vp.MinDepth = 0.0f; vp.MaxDepth = 1.0f; vp.TopLeftX = 0; vp.TopLeftY = 0; g_pd3dDevice->RSSetViewports(1, &vp); // Setup the effect { ID3D10Blob *pCompiledEffect; ID3D10Blob *pErrors = NULL; hr = sFnPtr_D3D10CompileEffectFromMemory((void *)g_simpleEffectSrc, sizeof(g_simpleEffectSrc), NULL, NULL, // pDefines NULL, // pIncludes 0, // HLSL flags 0, // FXFlags &pCompiledEffect, &pErrors); if (pErrors) { LPVOID l_pError = NULL; l_pError = pErrors->GetBufferPointer(); // then cast to a char* to see it // in the locals window fprintf(stdout, "Compilation error: \n %s", (char *)l_pError); } AssertOrQuit(SUCCEEDED(hr)); hr = sFnPtr_D3D10CreateEffectFromMemory( pCompiledEffect->GetBufferPointer(), pCompiledEffect->GetBufferSize(), 0, // FXFlags g_pd3dDevice, NULL, &g_pSimpleEffect); pCompiledEffect->Release(); g_pDrawQuadTechnique = g_pSimpleEffect->GetTechniqueByName("Render"); g_pDrawBirdsTechnique = g_pSimpleEffect->GetTechniqueByName("DrawBirds"); g_pvQuadRect = g_pSimpleEffect->GetVariableByName("g_vQuadRect")->AsVector(); g_pTexture2D = g_pSimpleEffect->GetVariableByName("g_Texture2D")->AsShaderResource(); g_pd3dDevice->IASetInputLayout(NULL); g_pd3dDevice->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST); } D3D10_RASTERIZER_DESC rasterizerState; rasterizerState.FillMode = D3D10_FILL_SOLID; rasterizerState.CullMode = D3D10_CULL_FRONT; rasterizerState.FrontCounterClockwise = false; rasterizerState.DepthBias = false; rasterizerState.DepthBiasClamp = 0; rasterizerState.SlopeScaledDepthBias = 0; rasterizerState.DepthClipEnable = false; rasterizerState.ScissorEnable = false; rasterizerState.MultisampleEnable = false; rasterizerState.AntialiasedLineEnable = false; g_pd3dDevice->CreateRasterizerState(&rasterizerState, &g_pRasterState); g_pd3dDevice->RSSetState(g_pRasterState); return S_OK; } //////////////////////////////////////////////////////////////////////////////// //! Draw the final result on the screen //////////////////////////////////////////////////////////////////////////////// void DrawScene() { // Clear the backbuffer to a black color float ClearColor[4] = {0.18f, 0.63f, 1.f, 1.0f}; g_pd3dDevice->ClearRenderTargetView(g_pSwapChainRTV, ClearColor); // // draw the 2d texture // float quadRect[4] = {-0.98f, -0.98f, 1.96f, 1.96f}; g_pvQuadRect->SetFloatVector((float *)&quadRect); #if 0 g_pDrawQuadTechnique->GetPassByIndex(0)->Apply(0); g_pd3dDevice->Draw(4, 0); #else g_pDrawBirdsTechnique->GetPassByIndex(0)->Apply(0); ID3D10ShaderResourceView *pSRVViews[1]; pSRVViews[0] = g_pPositionsSRV; g_pd3dDevice->VSSetShaderResources(0, 1, pSRVViews); g_pd3dDevice->Draw(3 * nBirds, 0); #endif // Present the backbuffer contents to the display g_pSwapChain->Present(0, 0); } //----------------------------------------------------------------------------- // Name: Cleanup() // Desc: Releases all previously initialized objects //----------------------------------------------------------------------------- void Cleanup() { // unregister the Cuda resources cudaGraphicsUnregisterResource(g_pCudaResourcePos); getLastCudaError( "cudaGraphicsUnregisterResource (g_pCudaResourcePos) failed"); cudaGraphicsUnregisterResource(g_pCudaResourceNewPos); getLastCudaError( "cudaGraphicsUnregisterResource (g_pCudaResourceNewPos) failed"); // // clean up Direct3D // { if (g_pInputLayout != NULL) { g_pInputLayout->Release(); } if (g_pSimpleEffect != NULL) { g_pSimpleEffect->Release(); } if (g_pSwapChainRTV != NULL) { g_pSwapChainRTV->Release(); } if (g_pSwapChain != NULL) { g_pSwapChain->Release(); } if (g_pd3dDevice != NULL) { g_pd3dDevice->Release(); } } // Uninitialize CUDA checkCudaErrors(cudaFree(d_pairs)); checkCudaErrors(cudaFree(d_triples)); checkCudaErrors(cudaFree(d_neighbors)); checkCudaErrors(cudaFree(d_leftgoals)); checkCudaErrors(cudaFree(d_rightgoals)); checkCudaErrors(cudaFree(d_hasproxy)); checkCudaErrors(cudaFree(d_params)); SAFE_RELEASE(g_pPositions); SAFE_RELEASE(g_pNewPositions); SAFE_RELEASE(g_pPositionsSRV); } //----------------------------------------------------------------------------- // Name: Render() // Desc: Launches the CUDA kernels to fill in the texture data //----------------------------------------------------------------------------- void Render() { // // map the resources we've registered so we can access them in Cuda // - it is most efficient to map and unmap all resources in a single call, // and to have the map/unmap calls be the boundary between using the GPU // for Direct3D and Cuda cudaGraphicsMapResources(1, &g_pCudaResourcePos, 0); getLastCudaError("cudaGraphicsMapResources(3) failed"); cudaGraphicsMapResources(1, &g_pCudaResourceNewPos, 0); getLastCudaError("cudaGraphicsMapResources(3) failed"); getLastCudaError("cudaD3D10MapResources(3) failed"); float2 *mappedpositions, *new_mappedpositions; static clock_t start, nextstart, end, end2, end3; static DWORD tick_start, next_tick_start = 0, tick_end; static uint step = 0; if (g_runCPU) { if (!step) { std::cout << "CPU simulation \n"; } if (!(step % 100)) { tick_start = next_tick_start; next_tick_start = GetTickCount(); } simulate(new_positions, positions, nBirds); std::swap(positions, new_positions); g_pd3dDevice->UpdateSubresource(g_pPositions, 0, NULL, positions, 0, 0); if (!(step % 100) && step) { tick_end = GetTickCount(); std::cout << "CPU, step " << step << " \n"; std::cout << "time per step " << float(tick_end - tick_start) / 100.f << " ms \n"; } step++; } else { if (!step) { std::cout << "CUDA simulation \n"; } size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer( (void **)&mappedpositions, &num_bytes, g_pCudaResourcePos)); getLastCudaError("cudaGraphicsResourceGetMappedPointer 1 failed \n"); checkCudaErrors(cudaGraphicsResourceGetMappedPointer( (void **)&new_mappedpositions, &num_bytes, g_pCudaResourceNewPos)); getLastCudaError("cudaGraphicsResourceGetMappedPointer 2 failed \n"); cuda_simulate(new_mappedpositions, mappedpositions, nBirds, d_hasproxy, d_neighbors, d_leftgoals, d_rightgoals, d_pairs, d_triples, d_params); std::swap(g_pCudaResourceNewPos, g_pCudaResourcePos); step++; } // // unmap the resources // cudaGraphicsUnmapResources(1, &g_pCudaResourcePos, 0); getLastCudaError("cudaGraphicsUnmapResources(3) failed"); cudaGraphicsUnmapResources(1, &g_pCudaResourceNewPos, 0); getLastCudaError("cudaGraphicsUnmapResources(3) failed"); // draw the scene using them // DrawScene(); } //----------------------------------------------------------------------------- // Name: MsgProc() // Desc: The window's message handler //----------------------------------------------------------------------------- static LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam) { switch (msg) { case WM_KEYDOWN: if (wParam == VK_ESCAPE) { g_bDone = true; Cleanup(); PostQuitMessage(0); return 0; } if (wParam == 'r' || wParam == 'R') { g_seed = (unsigned)time(NULL); srand(g_seed); for (uint i = 0; i < nBirds; i++) { positions[i].x = (float)rand() / (RAND_MAX + 1) * 768 - 500; positions[i].y = (float)rand() / (RAND_MAX + 1) * 768 - 300; } g_pd3dDevice->UpdateSubresource(g_pPositions, 0, NULL, positions, 0, 0); } if (wParam == 'g' || wParam == 'G') { g_runCPU = !g_runCPU; srand(g_seed); for (uint i = 0; i < nBirds; i++) { positions[i].x = (float)rand() / (RAND_MAX + 1) * 768 - 500; positions[i].y = (float)rand() / (RAND_MAX + 1) * 768 - 300; } g_pd3dDevice->UpdateSubresource(g_pPositions, 0, NULL, positions, 0, 0); } break; case WM_DESTROY: g_bDone = true; Cleanup(); PostQuitMessage(0); return 0; case WM_PAINT: ValidateRect(hWnd, NULL); return 0; } return DefWindowProc(hWnd, msg, wParam, lParam); } void initForCUDA(uint numBirds) { uint i, j, k = 0, l = 0; uint2 *p = pairs; for (i = 0; i < numBirds; i++) for (j = i + 1; j < numBirds; j++) { p->x = i; p->y = j; p++; } checkCudaErrors(cudaMemcpy(d_pairs, pairs, nBirds * (nBirds - 1) * sizeof(uint2) / 2, cudaMemcpyHostToDevice)); for (i = 0; i < numBirds; i++) for (j = i + 1; j < numBirds; j++) for (k = j + 1; k < numBirds; k++) { triples[l].x = i; triples[l].y = j; triples[l].z = k; l++; } checkCudaErrors( cudaMemcpy(d_triples, triples, nBirds * (nBirds - 1) * (nBirds - 2) * sizeof(uint3) / 6, cudaMemcpyHostToDevice)); params->alpha = 90.f; params->upwashX = 30.f; params->upwashY = 50.f; params->wingspan = 50.f; params->dX = .5f; params->dY = .5f; params->epsilon = 30.f; params->lambda = -0.1073f * params->wingspan; checkCudaErrors( cudaMemcpy(d_params, params, sizeof(Params), cudaMemcpyHostToDevice)); memset(leftgoals, 0, nBirds * nBirds * sizeof(bool)); memset(rightgoals, 0, nBirds * nBirds * sizeof(bool)); cudaMemset(d_neighbors, 0, nBirds * nBirds * sizeof(bool)); cudaMemset(d_leftgoals, 0, nBirds * nBirds * sizeof(bool)); cudaMemset(d_rightgoals, 0, nBirds * nBirds * sizeof(bool)); cudaMemset(d_hasproxy, 0, nBirds * sizeof(bool)); } void initialize(uint numBirds) { positions = new float2[numBirds]; new_positions = new float2[numBirds]; pairs = new uint2[numBirds * (numBirds - 1) / 2]; triples = new uint3[numBirds * (numBirds - 1) * (numBirds - 2) / 6]; params = new Params; leftgoals = new bool[numBirds * numBirds]; rightgoals = new bool[numBirds * numBirds]; for (uint i = 0; i < numBirds; i++) { positions[i].x = (float)rand() / (RAND_MAX + 1) * 768 - 500; positions[i].y = (float)rand() / (RAND_MAX + 1) * 768 - 300; } if (!g_runCPU) { initForCUDA(numBirds); } g_wingTips = new WingTip[2 * numBirds]; } float2 diff(float2 pos0, float2 pos1) { float2 ret; ret.x = pos1.x - pos0.x; ret.y = pos1.y - pos0.y; return ret; } float cross(float2 vec0, float2 vec1) { return vec0.x * vec1.y - vec0.y * vec1.x; } float norm(float2 pos) { return sqrt(pos.x * pos.x + pos.y * pos.y); } float dist(float2 pos0, float2 pos1) { return sqrt((pos0.x - pos1.x) * (pos0.x - pos1.x) + (pos0.y - pos1.y) * (pos0.y - pos1.y)); } bool isInsideQuad(float2 pos0, float2 pos1, float width, float height) { if (fabs(pos0.x - pos1.x) < 0.5f * width && fabs(pos0.y - pos1.y) < 0.5f * height) { return true; } else { return false; } } bool compare(WingTip &t1, WingTip &t2) { return t1.x < t2.x ? true : false; } bool compareGoals(ViewGoal &g1, ViewGoal &g2) { return g1.dist < g2.dist ? true : false; } float sign(float x) { if (x > 0.f) { return 1.f; } else if (x < 0.f) { return -1.f; } return 0.f; } bool isVisible(float2 pos, float2 goal) { float2 leftBorder, rightBorder; leftBorder.x = goal.x - (0.5f * wingspan + lambda) - pos.x; leftBorder.y = goal.y - pos.y; rightBorder.x = goal.x + (0.5f * wingspan + lambda) - pos.x; rightBorder.y = goal.y - pos.y; for (uint j = 0; j < nBirds; j++) { if (positions[j].y >= goal.y || positions[j].y <= pos.y) { continue; } float2 dirl, dirr; dirl.x = positions[j].x - pos.x + 0.5f * wingspan; dirl.y = positions[j].y - pos.y; dirr.x = positions[j].x - pos.x - 0.5f * wingspan; dirr.y = positions[j].y - pos.y; if (cross(leftBorder, dirl) < 0 && cross(rightBorder, dirr) > 0) { return false; } } return true; } void simulate(float2 *newPos, float2 *curPos, uint numBirds) { uint i, j, k, nGaps; std::vector vWingTips; Gap g_gaps[nBirds + 1]; for (i = 0; i < numBirds; i++) { WingTip tip; tip.x = curPos[i].x - 0.5f * wingspan; tip.y = curPos[i].y; tip.lr = 1; vWingTips.push_back(tip); tip.x = curPos[i].x + 0.5f * wingspan; tip.y = curPos[i].y; tip.lr = -1; vWingTips.push_back(tip); } bool isSorted = false; for (i = 0; i < numBirds; i++) { std::vector vViewGoals; bool useRule1 = true; newPos[i].x = curPos[i].x; newPos[i].y = curPos[i].y; uint upwashCount = 0; std::vector vNeighbors; for (j = 0; j < numBirds; j++) { if (j == i || curPos[j].y < curPos[i].y) { continue; } float2 curPosShiftedBack; curPosShiftedBack.x = curPos[j].x; curPosShiftedBack.y = curPos[j].y - 0.5f * upwashY; if (isInsideQuad(curPos[i], curPosShiftedBack, 2.f * (wingspan + lambda + upwashX), upwashY)) { uint neighbor = j; vNeighbors.push_back(neighbor); if (useRule1) { useRule1 = false; } if (curPos[i].x > curPos[j].x + (wingspan + lambda + 0.5f * upwashX) || curPos[i].x < curPos[j].x - (wingspan + lambda + 0.5f * upwashX)) { upwashCount++; } } } // if rule 1 is valid, find nearest bird and move to it float d = 0.f, minDist = 1000.f; float2 dij; dij.x = 0.f; dij.y = 0.f; uint nearest = 1000; if (useRule1) { for (j = 0; j < numBirds; j++) { if (j == i || curPos[j].y < curPos[i].y) { continue; } if ((d = norm(diff(curPos[i], curPos[j]))) < minDist) { minDist = d; nearest = j; dij = diff(curPos[i], curPos[j]); } } if (!d) { continue; } d ? dij.x = dij.x / d : dij.x = 0.f; d ? dij.y = dij.y / d : dij.y = 0.f; newPos[i].x = curPos[i].x + dX * dij.x; newPos[i].y = curPos[i].y + dY * dij.y; } else { if (!isSorted) { std::sort(vWingTips.begin(), vWingTips.end(), compare); isSorted = true; } // find all gaps that are big enough int count = 0; bool gapBegin = true; g_gaps[0].left.x = -1000.f; g_gaps[0].left.y = 0.f; j = 0; ViewGoal goal; for (k = 0; k < 2 * numBirds; k++) { if (vWingTips[k].y <= curPos[i].y) // look for gaps only ahead { continue; } count += vWingTips[k].lr; if (gapBegin && 1 == count) { gapBegin = false; g_gaps[j].right.x = vWingTips[k].x; g_gaps[j].right.y = vWingTips[k].y; if (g_gaps[j].right.x - g_gaps[j].left.x > wingspan + 2.f * lambda) { goal.pos.x = g_gaps[j].right.x - (0.5f * wingspan + lambda); goal.pos.y = g_gaps[j].right.y; goal.dist = fabs(goal.pos.x - curPos[i].x); vViewGoals.push_back(goal); if (j) { goal.pos.x = g_gaps[j].left.x + (0.5f * wingspan + lambda); goal.pos.y = g_gaps[j].left.y; goal.dist = fabs(goal.pos.x - curPos[i].x); vViewGoals.push_back(goal); } } } else if (!count) { j++; gapBegin = true; g_gaps[j].left.x = vWingTips[k].x; // + 0.5f * wingspan + lambda; g_gaps[j].left.y = vWingTips[k].y; } } g_gaps[j].right.x = 1000.f; g_gaps[j].right.y = 0.f; goal.pos.x = g_gaps[j].left.x + (0.5f * wingspan + lambda); goal.pos.y = g_gaps[j].left.y; goal.dist = fabs(goal.pos.x - curPos[i].x); vViewGoals.push_back(goal); nGaps = j + 1; // search the closest gap for unobstructed view minDist = 1000.f; dij.x = 0.f; dij.y = 0.f; for (j = 0; j < nGaps; j++) { if ((d = norm(diff(curPos[i], g_gaps[j].left))) < minDist) { minDist = d; dij = diff(curPos[i], g_gaps[j].left); } if ((d = norm(diff(curPos[i], g_gaps[j].right))) < minDist) { minDist = d; dij = diff(curPos[i], g_gaps[j].right); } } std::sort(vViewGoals.begin(), vViewGoals.end(), compareGoals); if (vViewGoals.size()) { if (vViewGoals[0].dist <= dX) { continue; } for (j = 0; j < vViewGoals.size(); j++) { if (!isVisible(curPos[i], vViewGoals[j].pos)) { continue; } newPos[i].x = curPos[i].x + sign(vViewGoals[j].pos.x - curPos[i].x) * dX; for (k = 0; k < vNeighbors.size(); k++) { if (curPos[vNeighbors[k]].y >= curPos[i].y && curPos[vNeighbors[k]].y < curPos[i].y + epsilon) { newPos[i].y = curPos[i].y - dY; } else if (curPos[vNeighbors[k]].y < curPos[i].y && curPos[vNeighbors[k]].y > curPos[i].y - epsilon) { newPos[i].y = curPos[i].y + dY; } } break; } } } vNeighbors.clear(); vViewGoals.clear(); } vWingTips.clear(); }