import time
import os
def long_time_task():
print('当前进程: {}'.format(os.getpid()))
time.sleep(2)
print("结果: {}".format(8 ** 20))
if __name__ == "__main__":
print('当前母进程: {}'.format(os.getpid()))
start = time.time()
for i in range(2):
long_time_task()
end = time.time()
print("用时{}秒".format((end-start)))
from multiprocessing import Pool, cpu_count
import os
import time
def long_time_task(i):
print('子进程: {} - 任务{}'.format(os.getpid(), i))
time.sleep(2)
print("结果: {}".format(8 ** 20))
if __name__=='__main__':
print("CPU内核数:{}".format(cpu_count()))
print('当前母进程: {}'.format(os.getpid()))
start = time.time()
p = Pool(4)
for i in range(5):
p.apply_async(long_time_task, args=(i,))
print('等待所有子进程完成。')
p.close()
p.join()
end = time.time()
print("总共用时{}秒".format((end - start)))
from multiprocessing import Process, Queue
import os, time, random
# 写数据进程执行的代码:
def write(q):
print('Process to write: {}'.format(os.getpid()))
for value in ['A', 'B', 'C']:
print('Put %s to queue...' % value)
q.put(value)
time.sleep(random.random())
# 读数据进程执行的代码:
def read(q):
print('Process to read:{}'.format(os.getpid()))
while True:
value = q.get(True)
print('Get %s from queue.' % value)
if __name__=='__main__':
# 父进程创建Queue,并传给各个子进程:
q = Queue()
pw = Process(target=write, args=(q,))
pr = Process(target=read, args=(q,))
# 启动子进程pw,写入:
pw.start()
# 启动子进程pr,读取:
pr.start()
# 等待pw结束:
pw.join()
# pr进程里是死循环,无法等待其结束,只能强行终止:
pr.terminate()
输出结果如下所示:
Process to write: 3036
Put A to queue...
Process to read:9408
Get A from queue.
Put B to queue...
Get B from queue.
Put C to queue...
Get C from queue.
import threading
import time
def long_time_task(i):
print('当前子线程: {} 任务{}'.format(threading.current_thread().name, i))
time.sleep(2)
print("结果: {}".format(8 ** 20))
if __name__=='__main__':
start = time.time()
print('这是主线程:{}'.format(threading.current_thread().name))
thread_list = []
for i in range(1, 3):
t = threading.Thread(target=long_time_task, args=(i, ))
thread_list.append(t)
for t in thread_list:
t.start()
for t in thread_list:
t.join()
end = time.time()
print("总共用时{}秒".format((end - start)))
import threading
import time
def long_time_task():
print('当子线程: {}'.format(threading.current_thread().name))
time.sleep(2)
print("结果: {}".format(8 ** 20))
if __name__=='__main__':
start = time.time()
print('这是主线程:{}'.format(threading.current_thread().name))
for i in range(5):
t = threading.Thread(target=long_time_task, args=())
t.setDaemon(True)
t.start()
end = time.time()
print("总共用时{}秒".format((end - start)))
import os
import tensorrt as trt
os.environ["CUDA_VISIBLE_DEVICES"]='0'
TRT_LOGGER = trt.Logger()
onnx_file_path = 'Unet375-simple.onnx'
engine_file_path = 'Unet337.trt'
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
builder.max_workspace_size = 1 << 28 # 256MiB
builder.max_batch_size = 1
# Parse model file
if not os.path.exists(onnx_file_path):
print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path))
exit(0)
print('Loading ONNX file from path {}...'.format(onnx_file_path))
with open(onnx_file_path, 'rb') as model:
print('Beginning ONNX file parsing')
if not parser.parse(model.read()):
print ('ERROR: Failed to parse the ONNX file.')
for error in range(parser.num_errors):
print (parser.get_error(error))
network.get_input(0).shape = [1, 3, 300, 400]
print('Completed parsing of ONNX file')
print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
#network.mark_output(network.get_layer(network.num_layers-1).get_output(0))
engine = builder.build_cuda_engine(network)
print("Completed creating Engine")
with open(engine_file_path, "wb") as f:
f.write(engine.serialize())
trtexec --loadEngine=mnist16.trt --batch=1
打印输出:
trtexec会打印出很多时间,这里需要对每个时间的含义进行解释,然后大家各取所需,进行评测。总的打印如下:
[09/06/2021-13:50:34] [I] Average on 10 runs - GPU latency: 2.74553 ms - Host latency: 3.74192 ms (end to end 4.93066 ms, enqueue 0.624805 ms) # 跑了10次,GPU latency: GPU计算耗时, Host latency:GPU输入+计算+输出耗时,end to end:GPU端到端的耗时,eventout - eventin,enqueue:CPU异步耗时
[09/06/2021-13:50:34] [I] Host Latency
[09/06/2021-13:50:34] [I] min: 3.65332 ms (end to end 3.67603 ms)
[09/06/2021-13:50:34] [I] max: 5.95093 ms (end to end 6.88892 ms)
[09/06/2021-13:50:34] [I] mean: 3.71375 ms (end to end 5.30082 ms)
[09/06/2021-13:50:34] [I] median: 3.70032 ms (end to end 5.32935 ms)
[09/06/2021-13:50:34] [I] percentile: 4.10571 ms at 99% (end to end 6.11792 ms at 99%)
[09/06/2021-13:50:34] [I] throughput: 356.786 qps
[09/06/2021-13:50:34] [I] walltime: 3.00741 s
[09/06/2021-13:50:34] [I] Enqueue Time
[09/06/2021-13:50:34] [I] min: 0.248474 ms
[09/06/2021-13:50:34] [I] max: 2.12134 ms
[09/06/2021-13:50:34] [I] median: 0.273987 ms
[09/06/2021-13:50:34] [I] GPU Compute
[09/06/2021-13:50:34] [I] min: 2.69702 ms
[09/06/2021-13:50:34] [I] max: 4.99219 ms
[09/06/2021-13:50:34] [I] mean: 2.73299 ms
[09/06/2021-13:50:34] [I] median: 2.71875 ms
[09/06/2021-13:50:34] [I] percentile: 3.10791 ms at 99%
[09/06/2021-13:50:34] [I] total compute time: 2.93249 s
Host Latency gpu: 输入+计算+输出 三部分的耗时
Enqueue Time:CPU异步的时间(该时间不具有参考意义,因为GPU的计算可能还没有完成)
GPU Compute:GPU计算的耗时
综上,去了Enqueue Time时间都是有意义的
成功执行这些代码的话,程序会以文本格式输出模型的信息,其内容应该和我们在上一节展示的输出一样。 整理一下,用 ONNX Python API 构造模型的代码如下:
import onnx
from onnx import helper
from onnx import TensorProto
# input and output
a = helper.make_tensor_value_info('a', TensorProto.FLOAT, [10, 10])
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 10])
b = helper.make_tensor_value_info('b', TensorProto.FLOAT, [10, 10])
output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [10, 10])
# Mul
mul = helper.make_node('Mul', ['a', 'x'], ['c'])
# Add
add = helper.make_node('Add', ['c', 'b'], ['output'])
# graph and model
graph = helper.make_graph([mul, add], 'linear_func', [a, x, b], [output])
model = helper.make_model(graph)
# save model
onnx.checker.check_model(model)
print(model)
onnx.save(model, 'linear_func.onnx')
老规矩,我们可以用 ONNX Runtime 运行模型,来看看模型是否正确:
import onnxruntime
import numpy as np
sess = onnxruntime.InferenceSession('linear_func.onnx')
a = np.random.rand(10, 10).astype(np.float32)
b = np.random.rand(10, 10).astype(np.float32)
x = np.random.rand(10, 10).astype(np.float32)
output = sess.run(['output'], {'a': a, 'b': b, 'x': x})[0]
assert np.allclose(output, a * x + b)
一切顺利的话,这段代码不会有任何报错信息。这说明我们的模型等价于执行 a * x + b 这个计算。
读取并修改 ONNX 模型
通过用 API 构造 ONNX 模型,我们已经彻底搞懂了 ONNX 由哪些模块组成。现在,让我们看看该如何读取现有的”.onnx”文件并从中提取模型信息。 首先,我们可以用下面的代码读取一个 ONNX 模型:
import onnx
model = onnx.load('linear_func.onnx')
print(model)
在读入之前的 linear_func.onnx 模型后,我们可以直接修改第二个节点的类型 node[1].op_type,把加法变成减法。这样,我们的模型描述的是 a * x - b 这个线性函数。大家感兴趣的话,可以用 ONNX Runtime 运行新模型 linear_func_2.onnx,来验证一下它和 a * x - b 是否等价。
为了应对更复杂的情况,我们来自定义一个奇怪的 my_add 算子。这个算子的输入张量 a, b ,输出 2a + b 的值。我们会先把它在 PyTorch 中实现,再把它导出到 ONNX 中。
为 PyTorch 添加 C++ 拓展
为 PyTorch 添加简单的 C++ 拓展还是很方便的。对于我们定义的 my_add 算子,可以用以下的 C++ 源文件来实现。我们把该文件命名为 “my_add.cpp”:
// my_add.cpp
#include <torch/torch.h>
torch::Tensor my_add(torch::Tensor a, torch::Tensor b)
{
return 2 * a + b;
}
PYBIND11_MODULE(my_lib, m)
{
m.def("my_add", my_add);
}
由于在 PyTorch 中添加 C++ 拓展和模型部署关系不大,这里我们仅给出这个简单的示例,并不对其原理做过多讲解。
在这段代码中,torch::Tensor 就是 C++ 中 torch 的张量类型,它的加法和乘法等运算符均已重载。因此,我们可以像对普通标量一样对张量做加法和乘法。
轻松地完成了算子的实现后,我们用 PYBIND11_MODULE 来为 C++ 函数提供 Python 调用接口。这里的 my_lib 是我们未来要在 Python 里导入的模块名。双引号中的 my_add 是 Python 调用接口的名称,这里我们对齐 C++ 函数的名称,依然用 “my_add”这个名字。
之后,我们可以编写如下的 Python 代码并命名为 “setup.py”,来编译刚刚的 C++ 文件:
from setuptools import setup
from torch.utils import cpp_extension
setup(name='my_add',
ext_modules=[cpp_extension.CppExtension('my_lib', ['my_add.cpp'])],
cmdclass={'build_ext': cpp_extension.BuildExtension})
这段代码使用了 Python 的 setuptools 编译功能和 PyTorch 的 C++ 拓展工具函数,可以编译包含了 torch 库的 C++ 源文件。这里我们需要填写的只有模块名和模块中的源文件名。我们刚刚把模块命名为 my_lib,而源文件只有一个 my_add.cpp,因此拓展模块那一行要写成 ext_modules=[cpp_extension.CppExtension('my_lib', ['my_add.cpp'])],。
之后,像处理普通的 Python 包一样执行安装命令,我们的 C++ 代码就会自动编译了。
python setup.py develop
用 torch.autograd.Function 封装
直接用 Python 接口调用 C++ 函数不太“美观”,一种比较优雅的做法是把这个调用接口封装起来。这里我们用 torch.autograd.Function 来封装算子的底层调用:
import torch
import my_lib
class MyAddFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, a, b):
return my_lib.my_add(a, b)
@staticmethod
def symbolic(g, a, b):
two = g.op("Constant", value_t=torch.tensor([2]))
a = g.op('Mul', a, two)
return g.op('Add', a, b)
Input[0] on model model_static.onnx succeed.
Input[1] on model model_static.onnx error.
Input[2] on model model_static.onnx error.
Input[0] on model model_dynamic_0.onnx succeed.
Input[1] on model model_dynamic_0.onnx succeed.
Input[2] on model model_dynamic_0.onnx error.
Input[0] on model model_dynamic_23.onnx succeed.
Input[1] on model model_dynamic_23.onnx error.
Input[2] on model model_dynamic_23.onnx succeed.
print(exceptions[(1, 'model_static.onnx')])
# output # [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Got invalid dimensions for input: in for the following indices index: 0 Got: 2 Expected: 1 Please fix either the inputs or the model.
class Model(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
x = x * x[0].item()
return x, torch.Tensor([i for i in x])
model = Model()
dummy_input = torch.rand(10)
torch.onnx.export(model, dummy_input, 'a.onnx')
return g.op("Resize",
input,
empty_roi,
empty_scales,
output_size,
coordinate_transformation_mode_s=coordinate_transformation_mode,
cubic_coeff_a_f=-0.75, # only valid when mode="cubic"
mode_s=interpolate_mode, # nearest, linear, or cubic
nearest_mode_s="floor") # only valid when mode="nearest"
rpm -i cuda-repo-rhel7-10-2-local-10.2.89-440.33.01-1.0-1.x86_64.rpm
tar -zxvf cudnn-10.2-linux-x64-v8.0.1.13.tgz
# tar -xzvf TensorRT-${version}.Linux.${arch}-gnu.${cuda}.${cudnn}.tar.gz
tar -xzvf TensorRT-7.1.3.4.CentOS-7.6.x86_64-gnu.cuda-10.2.cudnn8.0.tar.gz
Open Neural Network Exchange(ONNX,开放神经网络交换)格式,是一个用于表示深度学习模型的标准,可使模型在不同框架之间进行转移.最初的ONNX专注于推理(评估)所需的功能。 ONNX解释计算图的可移植,它使用graph的序列化格式
pth 转换为onnx
import onnx
import torch
def export_onnx(onnx_model_path, model, cuda, height, width, dummy_input=None):
model.eval()
if dummy_input is None:
dummy_input = torch.randn(1, 3, height, width).float()
dummy_input.requires_grad = True
print("dummy_input shape: ", dummy_input.shape, dummy_input.requires_grad)
if cuda:
dummy_input = dummy_input.cuda()
torch.onnx.export(
model, # model being run
dummy_input, # model input (or a tuple for multiple inputs)
onnx_model_path, # where to save the model (can be a file or file-like object)
export_params=True, # store the trained parameter weights inside the model file
opset_version=10, # the ONNX version to export the model to
do_constant_folding=True, # whether to execute constant folding for optimization
verbose=True,
input_names=['input'], # the model's input names
output_names=['output'], # the model's output names
)
import tensorrt as trt
def build_engine(onnx_path):
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
builder.max_batch_size = 128
builder.max_workspace_size = 1<<15
builder.fp16_mode = True
builder.strict_type_constraints = True
with open(onnx_path, 'rb') as model:
parser.parse(model.read())
# Build and return an engine.
return builder.build_cuda_engine(network)
#解析模型,构建engine并保存
with build_engine(onnx_path) as engine:
with open(engine_path, "wb") as f:
f.write(engine.serialize())
#直接加载engine
with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())
#解析模型,构建engine并保存
with build_engine(uff_path) as engine:
with open(engine_path, "wb") as f:
f.write(engine.serialize())
#直接加载engine
with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())