TensorRT Samples: CharRNN
For an introduction to TensorRT, see: http://blog.csdn.net/fengbingchun/article/details/78469551
GitHub: https://github.com/fengbingchun/CUDA_Test
The following test code was adapted from the sampleCharRNN.cpp file in TensorRT 2.1.2. The contents of the file (charrnn.cpp) are as follows:
#include <assert.h>
#include <string.h>
#include <string>
#include <fstream>
#include <iostream>
#include <tuple>
#include <map>
#include <sstream>
#include <vector>
#include <algorithm>
#include <NvInfer.h>
#include <NvUtils.h>
#include <cuda_runtime_api.h>
#include "common.hpp"

// reference: TensorRT-2.1.2/samples/sampleCharRNN/sampleCharRNN.cpp
// demonstrates how to generate a simple RNN based on the charRNN network using the PTB dataset

namespace {

// Information describing the network:
// int: layer count, batch size, hidden size, seq size, data size, output size
// string: input blob name, hidden in blob name, cell in blob name, output blob name, hidden out blob name, cell out blob name
typedef std::tuple<int, int, int, int, int, int, std::string, std::string,
    std::string, std::string, std::string, std::string> NET_INFO;

// These mappings came from training with tensorflow 0.12.1
static std::map<char, int> char_to_id{
    {'#', 40}, {'$', 31}, {'\'', 28}, {'&', 35}, {'*', 49}, {'-', 32}, {'/', 48}, {'.', 27},
    {'1', 37}, {'0', 36}, {'3', 39}, {'2', 41}, {'5', 43}, {'4', 47}, {'7', 45}, {'6', 46},
    {'9', 38}, {'8', 42}, {'<', 22}, {'>', 23}, {'\0', 24}, {'N', 26}, {'\\', 44}, {' ', 0},
    {'a', 3}, {'c', 13}, {'b', 20}, {'e', 1}, {'d', 12}, {'g', 18}, {'f', 15}, {'i', 6},
    {'h', 9}, {'k', 17}, {'j', 30}, {'m', 14}, {'l', 10}, {'o', 5}, {'n', 4}, {'q', 33},
    {'p', 16}, {'s', 7}, {'r', 8}, {'u', 11}, {'t', 2}, {'w', 21}, {'v', 25}, {'y', 19},
    {'x', 29}, {'z', 34}};

// A mapping from index to character.
static std::vector<char> id_to_char{
    {' ', 'e', 't', 'a', 'n', 'o', 'i', 's', 'r', 'h', 'l', 'u', 'd', 'c', 'm', 'f', 'p',
     'k', 'g', 'y', 'b', 'w', '<', '>', '\0', 'v', 'N', '.', '\'', 'x', 'j', '$', '-', 'q',
     'z', '&', '0', '1', '9', '3', '#', '2', '8', '5', '\\', '7', '6', '4', '/', '*'}};

// Our weight files are in a very simple space delimited format.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string& file)
{
    std::map<std::string, nvinfer1::Weights> weightMap;
    std::ifstream input(file);
    if (!input.is_open()) {
        fprintf(stderr, "Unable to load weight file: %s\n", file.c_str());
        return weightMap;
    }

    int32_t count;
    input >> count;
    if (count <= 0) {
        fprintf(stderr, "Invalid weight map file: %d\n", count);
        return weightMap;
    }

    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<nvinfer1::DataType>(type);
        if (wt.type == nvinfer1::DataType::kFLOAT) {
            // allocate sizeof(uint32_t) per element, not sizeof(val): val is a pointer
            uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
            for (uint32_t x = 0, y = size; x < y; ++x) {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        } else if (wt.type == nvinfer1::DataType::kHALF) {
            uint16_t* val = reinterpret_cast<uint16_t*>(malloc(sizeof(uint16_t) * size));
            for (uint32_t x = 0, y = size; x < y; ++x) {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        }
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Reshape plugin to feed RNN into FC layer correctly.
class Reshape : public nvinfer1::IPlugin {
public:
    Reshape(size_t size) : mSize(size) {}
    Reshape(const void* buf, size_t size)
    {
        assert(size == sizeof(mSize));
        mSize = *static_cast<const size_t*>(buf);
    }

    int getNbOutputs() const override { return 1; }
    int initialize() override { return 0; }
    void terminate() override {}
    size_t getWorkspaceSize(int) const override { return 0; }

    int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
    {
        // reshaping is a plain copy: only the reported dimensions change
        cudaMemcpyAsync(static_cast<float*>(outputs[0]), static_cast<const float*>(inputs[0]),
            sizeof(float) * mSize * batchSize, cudaMemcpyDefault, stream);
        return 0;
    }

    size_t getSerializationSize() override { return sizeof(mSize); }
    void serialize(void* buffer) override { (*static_cast<size_t*>(buffer)) = mSize; }
    void configure(const nvinfer1::Dims*, int, const nvinfer1::Dims*, int, int) override {}

    // The RNN outputs in {L, N, C}, but FC layer needs {C, 1, 1}, so we can convert RNN
    // output to {L*N, C, 1, 1} and TensorRT will handle the rest.
    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
    {
        assert(nbInputDims == 1 && index == 0 && inputs[index].nbDims == 3);
        return nvinfer1::DimsNCHW(inputs[index].d[1] * inputs[index].d[0], inputs[index].d[2], 1, 1);
    }

private:
    size_t mSize{0};
};

class PluginFactory : public nvinfer1::IPluginFactory {
public:
    // deserialization plugin implementation
    nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
    {
        assert(!strncmp(layerName, "reshape", 7));
        if (!mPlugin) mPlugin = new Reshape(serialData, serialLength);
        return mPlugin;
    }

    void destroyPlugin()
    {
        if (mPlugin) delete mPlugin;
        mPlugin = nullptr;
    }

private:
    Reshape* mPlugin{nullptr};
}; // PluginFactory

// Reorder the TensorFlow weight parameters for BasicLSTMCell into the layout TensorRT expects
nvinfer1::Weights convertRNNWeights(nvinfer1::Weights input, const NET_INFO& info)
{
    float* ptr = static_cast<float*>(malloc(sizeof(float) * input.count));
    int indir[4]{1, 2, 0, 3};
    int order[5]{0, 1, 4, 2, 3};
    int dims[5]{std::get<0>(info), 2, 4, std::get<2>(info), std::get<2>(info)};
    nvinfer1::utils::reshapeWeights(input, dims, order, ptr, 5);
    nvinfer1::utils::transposeSubBuffers(ptr, nvinfer1::DataType::kFLOAT, std::get<0>(info) * 2,
        std::get<2>(info) * std::get<2>(info), 4);
    int subMatrix = std::get<2>(info) * std::get<2>(info);
    int layerOffset = 8 * subMatrix;
    for (int z = 0; z < std::get<0>(info); ++z) {
        nvinfer1::utils::reorderSubBuffers(ptr + z * layerOffset, indir, 4, subMatrix * sizeof(float));
        nvinfer1::utils::reorderSubBuffers(ptr + z * layerOffset + 4 * subMatrix, indir, 4, subMatrix * sizeof(float));
    }
    return nvinfer1::Weights{input.type, ptr, input.count};
}

// Reorder the TensorFlow bias parameters for BasicLSTMCell
nvinfer1::Weights convertRNNBias(nvinfer1::Weights input, const NET_INFO& info)
{
    float* ptr = static_cast<float*>(malloc(sizeof(float) * input.count * 2));
    std::fill(ptr, ptr + input.count * 2, 0.f);
    const float* iptr = static_cast<const float*>(input.values);
    int indir[4]{1, 2, 0, 3};
    for (int z = 0, y = 0; z < std::get<0>(info); ++z)
        for (int x = 0; x < 4; ++x, ++y)
            std::copy(iptr + y * std::get<2>(info), iptr + (y + 1) * std::get<2>(info),
                ptr + (z * 8 + indir[x]) * std::get<2>(info));
    return nvinfer1::Weights{input.type, ptr, input.count * 2};
}

// The fully connected weights from tensorflow are transposed compared to the order that TensorRT expects them to be in.
nvinfer1::Weights transposeFCWeights(nvinfer1::Weights input, const NET_INFO& info)
{
    float* ptr = static_cast<float*>(malloc(sizeof(float) * input.count));
    const float* iptr = static_cast<const float*>(input.values);
    assert(input.count == std::get<2>(info) * std::get<5>(info));
    for (int z = 0; z < std::get<2>(info); ++z)
        for (int x = 0; x < std::get<5>(info); ++x)
            ptr[x * std::get<2>(info) + z] = iptr[z * std::get<5>(info) + x];
    return nvinfer1::Weights{input.type, ptr, input.count};
}
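// APIToModel below builds the network directly with the TensorRT builder API (no parser):
// embedded input -> 2-layer LSTM -> Reshape plugin -> FullyConnected(50) -> SoftMax.
// The hidden/cell states are extra network inputs/outputs so they can be carried across
// time steps, and the finished engine is serialized into modelStream.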
int APIToModel(std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::IHostMemory** modelStream,
    const NET_INFO& info, Logger logger)
{
    // create the builder
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);

    // create the model to populate the network, then set the outputs and create an engine
    nvinfer1::INetworkDefinition* network = builder->createNetwork();
    auto data = network->addInput(std::get<6>(info).c_str(), nvinfer1::DataType::kFLOAT,
        nvinfer1::DimsCHW{std::get<3>(info), std::get<1>(info), std::get<4>(info)});
    CHECK(data != nullptr);
    auto hiddenIn = network->addInput(std::get<7>(info).c_str(), nvinfer1::DataType::kFLOAT,
        nvinfer1::DimsCHW{std::get<0>(info), std::get<1>(info), std::get<2>(info)});
    CHECK(hiddenIn != nullptr);
    auto cellIn = network->addInput(std::get<8>(info).c_str(), nvinfer1::DataType::kFLOAT,
        nvinfer1::DimsCHW{std::get<0>(info), std::get<1>(info), std::get<2>(info)});
    CHECK(cellIn != nullptr);

    // Create an RNN layer w/ 2 layers and 512 hidden states
    auto tfwts = weightMap["rnnweight"];
    nvinfer1::Weights rnnwts = convertRNNWeights(tfwts, info);
    auto tfbias = weightMap["rnnbias"];
    nvinfer1::Weights rnnbias = convertRNNBias(tfbias, info);
    auto rnn = network->addRNN(*data, std::get<0>(info), std::get<2>(info), std::get<3>(info),
        nvinfer1::RNNOperation::kLSTM, nvinfer1::RNNInputMode::kLINEAR, nvinfer1::RNNDirection::kUNIDIRECTION,
        rnnwts, rnnbias);
    CHECK(rnn != nullptr);
    rnn->getOutput(0)->setName("RNN output");
    rnn->setHiddenState(*hiddenIn);
    if (rnn->getOperation() == nvinfer1::RNNOperation::kLSTM) rnn->setCellState(*cellIn);

    // Reshape the RNN output so the FC layer can consume it
    Reshape reshape(std::get<3>(info) * std::get<1>(info) * std::get<2>(info));
    nvinfer1::ITensor* ptr = rnn->getOutput(0);
    auto plugin = network->addPlugin(&ptr, 1, reshape);
    plugin->setName("reshape");

    // Add a fully connected layer with 50 outputs.
    auto tffcwts = weightMap["rnnfcw"];
    auto wts = transposeFCWeights(tffcwts, info);
    auto bias = weightMap["rnnfcb"];
    auto fc = network->addFullyConnected(*plugin->getOutput(0), std::get<5>(info), wts, bias);
    CHECK(fc != nullptr);
    fc->getOutput(0)->setName("FC output");

    // Add a softmax layer to determine the probability.
    auto prob = network->addSoftMax(*fc->getOutput(0));
    CHECK(prob != nullptr);
    prob->getOutput(0)->setName(std::get<9>(info).c_str());
    network->markOutput(*prob->getOutput(0));
    rnn->getOutput(1)->setName(std::get<10>(info).c_str());
    network->markOutput(*rnn->getOutput(1));
    if (rnn->getOperation() == nvinfer1::RNNOperation::kLSTM) {
        rnn->getOutput(2)->setName(std::get<11>(info).c_str());
        network->markOutput(*rnn->getOutput(2));
    }

    // Build the engine
    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(1 << 25);

    // Store the transformed weights in the weight map so the memory can be properly released later.
    weightMap["rnnweight2"] = rnnwts;
    weightMap["rnnbias2"] = rnnbias;
    weightMap["rnnfcw2"] = wts;

    auto engine = builder->buildCudaEngine(*network);
    CHECK(engine != nullptr);
    // we don't need the network any more
    network->destroy();
    // serialize the engine, then close everything down
    (*modelStream) = engine->serialize();
    engine->destroy();
    builder->destroy();
    return 0;
}

void stepOnce(float** data, void** buffers, int* sizes, int* indices, int numBindings,
    cudaStream_t& stream, nvinfer1::IExecutionContext& context)
{
    // DMA the input to the GPU
    for (int z = 0, w = numBindings / 2; z < w; ++z)
        cudaMemcpyAsync(buffers[indices[z]], data[z], sizes[z] * sizeof(float), cudaMemcpyHostToDevice, stream);

    // Execute asynchronously
    context.enqueue(1, buffers, stream, nullptr);

    // DMA the output from the GPU
    for (int z = numBindings / 2, w = numBindings; z < w; ++z)
        cudaMemcpyAsync(data[z], buffers[indices[z]], sizes[z] * sizeof(float), cudaMemcpyDeviceToHost, stream);

    // Copy Ct/Ht to the Ct-1/Ht-1 slots.
    cudaMemcpyAsync(data[1], buffers[indices[4]], sizes[1] * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaMemcpyAsync(data[2], buffers[indices[5]], sizes[2] * sizeof(float), cudaMemcpyDeviceToHost, stream);
}

bool doInference(nvinfer1::IExecutionContext& context, const std::string& input, const std::string& expected,
    std::map<std::string, nvinfer1::Weights>& weightMap, const NET_INFO& info)
{
    const nvinfer1::ICudaEngine& engine = context.getEngine();
    // We have 6 bindings for LSTM; this needs to be changed to 4 for any other RNN type
    static const int numBindings = 6;
    assert(engine.getNbBindings() == numBindings);

    void* buffers[numBindings];
    float* data[numBindings];
    std::fill(buffers, buffers + numBindings, nullptr);
    std::fill(data, data + numBindings, nullptr);
    const char* names[numBindings] = {std::get<6>(info).c_str(), std::get<7>(info).c_str(), std::get<8>(info).c_str(),
        std::get<9>(info).c_str(), std::get<10>(info).c_str(), std::get<11>(info).c_str()};
    int indices[numBindings];
    std::fill(indices, indices + numBindings, -1);
    int sizes[numBindings] = {std::get<3>(info) * std::get<1>(info) * std::get<4>(info),
        std::get<0>(info) * std::get<1>(info) * std::get<2>(info),
        std::get<0>(info) * std::get<1>(info) * std::get<2>(info),
        std::get<5>(info),
        std::get<0>(info) * std::get<1>(info) * std::get<2>(info),
        std::get<0>(info) * std::get<1>(info) * std::get<2>(info)};

    for (int x = 0; x < numBindings; ++x) {
        // In order to bind the buffers, we need to know the names of the input and output tensors.
        // note that indices are guaranteed to be less than IEngine::getNbBindings()
        indices[x] = engine.getBindingIndex(names[x]);
        if (indices[x] == -1) continue;
        // create GPU buffers and a stream
        assert(indices[x] < numBindings);
        cudaMalloc(&buffers[indices[x]], sizes[x] * sizeof(float));
        data[x] = new float[sizes[x]];
    }

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // Initialize input/hidden/cell state to zero
    for (int x = 0; x < numBindings; ++x) std::fill(data[x], data[x] + sizes[x], 0.0f);

    auto embed = weightMap["embed"];
    std::string genstr;
    assert(std::get<1>(info) == 1 && "This code assumes batch size is equal to 1.");
    // Seed the RNN with the input.
    for (auto& a : input) {
        std::copy(reinterpret_cast<const float*>(embed.values) + char_to_id[a] * std::get<4>(info),
            reinterpret_cast<const float*>(embed.values) + char_to_id[a] * std::get<4>(info) + std::get<4>(info),
            data[0]);
        stepOnce(data, buffers, sizes, indices, 6, stream, context);
        cudaStreamSynchronize(stream);
        genstr.push_back(a);
    }

    // Now that we have gone through the initial sequence, let's make sure that we get the sequence out
    // that we are expecting.
    for (size_t x = 0, y = expected.size(); x < y; ++x) {
        std::copy(reinterpret_cast<const float*>(embed.values) + char_to_id[*genstr.rbegin()] * std::get<4>(info),
            reinterpret_cast<const float*>(embed.values) + char_to_id[*genstr.rbegin()] * std::get<4>(info) + std::get<4>(info),
            data[0]);
        stepOnce(data, buffers, sizes, indices, 6, stream, context);
        cudaStreamSynchronize(stream);

        // note: the host-side results are indexed by name order, so the softmax output lives in data[3]
        float* probabilities = data[3];
        ptrdiff_t idx = std::max_element(probabilities, probabilities + sizes[3]) - probabilities;
        genstr.push_back(id_to_char[idx]);
    }
    fprintf(stdout, "Received: %s\n", genstr.c_str() + input.size());

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    for (int x = 0; x < numBindings; ++x) {
        if (indices[x] != -1) cudaFree(buffers[indices[x]]);
        if (data[x]) delete[] data[x];
    }
    return genstr == (input + expected);
}

} // namespace

int test_charrnn()
{
    const NET_INFO info(2, 1, 512, 1, 512, 50, "data", "hiddenIn", "cellIn", "prob", "hiddenOut", "cellOut");
    Logger logger; // multiple instances of IRuntime and/or IBuilder must all use the same logger

    // create a model using the API directly and serialize it to a stream
    nvinfer1::IHostMemory* modelStream{nullptr};
    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights("models/char-rnn.wts");
    APIToModel(weightMap, &modelStream, info, logger);

    const std::vector<std::string> in_strs{"customer serv", "business plans", "help", "slightly under", "market",
        "holiday cards", "bring it", "what time", "the owner thinks", "money can be use"};
    const std::vector<std::string> out_strs{"es and the", " to be a", "en and", "iting the company", "ing and",
        " the company", " company said it will", "d and the company", "ist with the", "d to be a"};
    CHECK(in_strs.size() == out_strs.size());

    PluginFactory pluginFactory;
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), &pluginFactory);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();

    for (size_t num = 0; num < in_strs.size(); ++num) {
        fprintf(stdout, "RNN Warmup: %s, Expect: %s\n", in_strs[num].c_str(), out_strs[num].c_str());
        bool pass = doInference(*context, in_strs[num], out_strs[num], weightMap, info);
        if (!pass) fprintf(stderr, "Failure!\n");
    }

    if (modelStream) modelStream->destroy();
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }

    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    pluginFactory.destroyPlugin();
    return 0;
}

The execution results are as follows (for each test string, the program prints an "RNN Warmup: ..., Expect: ..." line followed by the "Received: ..." continuation):
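Note that charrnn.cpp also depends on the repo's common.hpp for the Logger class and the CHECK macro used throughout. The repo's actual header may differ; a minimal sketch of the surface this file needs would be:

// Minimal sketch of the helpers charrnn.cpp expects from common.hpp.
// This is an assumption for illustration; the version in the CUDA_Test repo may differ.
#ifndef COMMON_HPP_
#define COMMON_HPP_

#include <cstdio>
#include <NvInfer.h>

// TensorRT needs an ILogger implementation; here everything other than INFO goes to stderr.
class Logger : public nvinfer1::ILogger {
public:
    void log(Severity severity, const char* msg) override
    {
        if (severity != Severity::kINFO) fprintf(stderr, "%s\n", msg);
    }
};

// Simple runtime check that reports failures without aborting.
#define CHECK(x) \
    do { \
        if (!(x)) fprintf(stderr, "CHECK failed: %s, %s:%d\n", #x, __FILE__, __LINE__); \
    } while (0)

#endif // COMMON_HPP_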
The build steps for the test code are as follows (ReadMe.txt):

To build the test code in TensorRT_Test with CMake on Linux:
1. Open a terminal in CUDA_Test/prj/linux_tensorrt_cmake and run the following commands in order:
$ mkdir build
$ cd build
$ cmake ..
$ make                                 (builds the TensorRT_Test executable)
$ ln -s ../../../test_data/models ./   (symlinks the models directory into build; see the weight-file sketch below)
$ ln -s ../../../test_data/images ./   (symlinks the images directory into build)
$ ./TensorRT_Test
2. For tests that read images through OpenCV, first change the image paths in the corresponding source files to a path format that Linux supports.
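The models directory symlinked in step 1 must contain char-rnn.wts. As the comment above loadWeights says, the weight file is a simple space-delimited format: a weight count first, then one entry per weight consisting of a name, a DataType value, an element count, and that many hex-encoded raw values. As a rough sketch of that format (writeWeights and its arguments are illustrative, not part of the repo; the real char-rnn.wts ships with the TensorRT sample data), such a file could be produced like this:

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Illustrative only: emit a weight map in the space-delimited format that loadWeights() parses.
// Each vector holds the raw bit patterns of float values (nvinfer1::DataType::kFLOAT == 0).
void writeWeights(const std::string& file, const std::map<std::string, std::vector<uint32_t>>& weights)
{
    FILE* fp = fopen(file.c_str(), "w");
    if (!fp) return;
    fprintf(fp, "%zu\n", weights.size()); // first token: number of weight entries
    for (const auto& w : weights) {
        // per entry: name, type (0 == kFLOAT), element count, then the values in hex
        fprintf(fp, "%s 0 %zu", w.first.c_str(), w.second.size());
        for (uint32_t v : w.second) fprintf(fp, " %x", v);
        fprintf(fp, "\n");
    }
    fclose(fp);
}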