前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >SkeyeRTSPLive高效转码之SkeyeVideoDecoder采用Nvidia独立显卡高效硬件解码解决方案(附源码)(2)

SkeyeRTSPLive高效转码之SkeyeVideoDecoder采用Nvidia独立显卡高效硬件解码解决方案(附源码)(2)

原创
作者头像
Openskeye
发布2023-04-23 14:17:43
4000
发布2023-04-23 14:17:43
举报
文章被收录于专栏:国标视频云平台

在我之前写的一篇文章《SkeyeRTSPLive传统视频监控互联网+实现利器解决方案》中提到RTSP转RTMP的转流过程,简化流程就是通过SkeyeRTSPClient拉RTSP流,获取音视频编码数据,然后再通过SkeyeRTMPPusher推出去,流程非常简单;然而在实际开发过程中,我们发现其实这个过程并没有想象中那么简单;首先,RTSP协议支持多种音视频编码格式,如音频支持AAC,G711,G726等,视频支持H264,H265,MJPEG, MPEG等等各种格式,而SkeyeRTMPPusher推流只支持H264(已扩展支持H265)格式,这时,音频我们可以通过SkeyeAACEncoder将音频转码成AAC格式,而视频我们可以通过SkeyeVideoDecoder解码成原始数据,然后再通过SkeyeVideoEncoder将原始数据转码成RTMP推送指定的格式,本文,我们将重点讲述SkeyeVideoDecoder基于Nvidia(英伟达)独立显卡的解码流程。

SkeyeVideoDecoder基于Nvidia独立显卡的硬解码库SkeyeNvDecoder

SkeyeNvDecoder库是基于Nvidia独立显卡驱动的硬件解码程序,该解码程序效率非常高,且具有强大的并行解码能力,其解码效率比ffmpeg软件解码至少提高5-6倍,最新的RTX系列显卡其解码效率甚至比软解码高10-12倍,轻松解码多路4K乃至8K高清视频无压力,本文采用的是截止目前(20190714)最新的显卡驱动,CUDA版本需要10.0或者以上版本支持。

1. 接口声明如下:
代码语言:txt
复制
#ifndef SKEYENVDECODERAPI_H
#define SKEYENVDECODERAPI_H

#include <cstdint>
#include <string>

//++ typedefine start
#ifndef SKEYENVDECODER_HANDLE
#define SKEYENVDECODER_HANDLE void*
#endif//SKEYENVDECODER_HANDLE

/**
 * Pixel format requested for decoded frames.
 * `native` keeps the decoder's default output, which is NV12.
 */
typedef enum _OutputFormat
{
	native = 0, // decoder default output (NV12)
	bgrp   = 1,
	rgbp   = 2,
	bgra   = 3,
	rgba   = 4,
	bgra64 = 5,
	rgba64 = 6
}OutputFormat;


/**
 * Video codecs accepted by the decoder API.
 * The numeric values are spelled out because callers pass them across the
 * library boundary as plain integers.
 */
typedef enum _SKEYENvDecoder_CodecType {
	SKEYENvDecoder_Codec_MPEG1     = 0,   /**<  MPEG1             */
	SKEYENvDecoder_Codec_MPEG2     = 1,   /**<  MPEG2             */
	SKEYENvDecoder_Codec_MPEG4     = 2,   /**<  MPEG4             */
	SKEYENvDecoder_Codec_VC1       = 3,   /**<  VC1               */
	SKEYENvDecoder_Codec_H264      = 4,   /**<  H264              */
	SKEYENvDecoder_Codec_JPEG      = 5,   /**<  JPEG              */
	SKEYENvDecoder_Codec_H264_SVC  = 6,   /**<  H264-SVC          */
	SKEYENvDecoder_Codec_H264_MVC  = 7,   /**<  H264-MVC          */
	SKEYENvDecoder_Codec_HEVC      = 8,   /**<  HEVC              */
	SKEYENvDecoder_Codec_VP8       = 9,   /**<  VP8               */
	SKEYENvDecoder_Codec_VP9       = 10,  /**<  VP9               */
	SKEYENvDecoder_Codec_NumCodecs = 11,  /**<  Max codecs        */
} SKEYENvDecoder_CodecType;

/**
 * Uncompressed YUV layouts, identified by their four-character code.
 * Each value packs the four ASCII characters big-endian into one integer
 * (first character in the top byte).
 */
typedef enum _SKEYENvDecoder_YUVType {
	SKEYENvDecoder_YUV420 = 0x49595556,   /**< 'IYUV' - Y,U,V (4:2:0)      */
	SKEYENvDecoder_YV12   = 0x59563132,   /**< 'YV12' - Y,V,U (4:2:0)      */
	SKEYENvDecoder_NV12   = 0x4E563132,   /**< 'NV12' - Y,UV  (4:2:0)      */
	SKEYENvDecoder_YUYV   = 0x59555956,   /**< 'YUYV' - YUYV/YUY2 (4:2:2)  */
	SKEYENvDecoder_UYVY   = 0x55595659    /**< 'UYVY' - UYVY (4:2:2)       */
} SKEYENvDecoder_YUVType;

#ifdef __cplusplus
extern "C"
{
#endif

int SKEYENvDecoder_Initsize(std::string &erroStr);

//除非使用低延迟模式,否则请不要使用此标志bLowLatency,但是使用此标志很难获得硬件解码器100%的利用率。
SKEYENVDECODER_HANDLE NvDecoder_Create(NvDecoder_CodecType codec, int videoW, int videoH, bool bLowLatency, OutputFormat eOutputFormat, int& errCode, std::string &erroStr);
int NvDecoder_Decode(NVDECODER_HANDLE handle, const uint8_t *pData, int nSize, uint8_t ***pppFrame, int* pnFrameLen, int *pnFrameReturned);
void SKEYENvDecoder_Release(NVDECODER_HANDLE handle) ;
int NvDecoder_Uninitsize();


#ifdef __cplusplus
}
#endif

#endif // SKEYENVDECODERAPI_H
2. SkeyeNvDecoder解码库调用流程
  • 第一步,初始化注册解码器 注意,注册解码器函数全局只需调用一次:
/**
 * Initializes CUDA and creates one context per GPU. The work is done only on
 * the first call; later calls just report whether any context is registered.
 *
 * @param erroStr Receives a description of the failure.
 * @return 1 on success, -1 when no GPU context is available,
 *         -2 when a CUDA call raised an exception.
 */
int SKEYENvDecoder_Initsize(string &erroStr)
{
	try {
		if (!isInitsized) {
			// The GPUs are initialized only once.
			ck(cuInit(0));
			int nGpu = 0;
			ck(cuDeviceGetCount(&nGpu));
			for (int i = 0; i < nGpu; i++) {
				CUdevice cuDevice = 0;
				ck(cuDeviceGet(&cuDevice, i));
				char szDeviceName[128];
				ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
				LOG(INFO) << "Find Gpu: " << szDeviceName << std::endl;
				CUcontext cuContext = NULL;
				ck(cuCtxCreate(&cuContext, CU_CTX_SCHED_BLOCKING_SYNC, cuDevice));
				m_ctxV.push_back({ cuContext,szDeviceName });
			}
			isInitsized = true;
			m_curIndex = 0;
		}
		if (m_ctxV.empty()) {
			// Report why the call failed instead of leaving erroStr untouched.
			erroStr = "No GPU context available";
			return -1;
		}
	}
	catch (const std::exception& ex) {
		erroStr = ex.what();
		std::cout << ex.what();
		return -2;
	}
	return 1;
}
  • 第二步,创建解码器实例
代码语言:txt
复制
/**
 * Creates and starts a decoder on one of the GPUs, chosen round-robin through
 * m_curIndex. Each call creates its own CUDA context; the shared-context path
 * through m_ctxV is not used here.
 *
 * @param codec         Codec of the input bitstream.
 * @param videoW        Width of the video in pixels.
 * @param videoH        Height of the video in pixels.
 * @param bLowLatency   Forwarded to the NvDecoder constructor.
 * @param eOutputFormat Requested output pixel format.
 * @param errCode       Receives the cuvidGetDecoderCaps result, or
 *                      CUDA_ERROR_NO_DEVICE when no GPU is reported,
 *                      -1 when the codec is not supported,
 *                      -2 when the resolution is not supported,
 *                      -3 when an exception was raised.
 * @param erroStr       Receives a description of the failure.
 * @return The decoder handle, or NULL on failure.
 *
 * NOTE(review): on success the context created here is handed to NvDecoder;
 * nothing visible in this file destroys it when the decoder is released.
 * Confirm that NvDecoder owns it.
 */
SKEYENVDECODER_HANDLE SKEYENvDecoder_Create(SKEYENvDecoder_CodecType codec, int videoW, int videoH, bool bLowLatency, OutputFormat eOutputFormat, int& errCode, string &erroStr)
{
	// Declared outside the try block so every failure path can release them.
	CUcontext cuContext = NULL;
	NvDecoder* pDecoder = NULL;

	try {
		ck(cuInit(0));
		int nGpu = 0;
		ck(cuDeviceGetCount(&nGpu));
		if (nGpu <= 0)
		{
			// Guard the modulo below against a division by zero.
			erroStr = "No CUDA capable GPU found";
			errCode = CUDA_ERROR_NO_DEVICE;
			return NULL;
		}

		// Advance the round-robin cursor and create a context on that GPU.
		m_curIndex = (m_curIndex + 1) % nGpu;
		const int nDeviceIndex = static_cast<int>(m_curIndex);
		CUdevice cuDevice = 0;
		ck(cuDeviceGet(&cuDevice, nDeviceIndex));
		char szDeviceName[128];
		ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
		LOG(INFO) << "Find Gpu: " << szDeviceName << std::endl;
		ck(cuCtxCreate(&cuContext, CU_CTX_SCHED_BLOCKING_SYNC, cuDevice));

		// Capabilities are queried for 8-bit 4:2:0 content of the given codec.
		const cudaVideoCodec eCodec = static_cast<cudaVideoCodec>(codec);
		CUVIDDECODECAPS videoDecodeCaps = {};
		videoDecodeCaps.eCodecType = eCodec;
		videoDecodeCaps.eChromaFormat = cudaVideoChromaFormat_420;
		videoDecodeCaps.nBitDepthMinus8 = 0;
		errCode = cuvidGetDecoderCaps(&videoDecodeCaps);

		if (CUDA_SUCCESS != errCode)
		{
			erroStr = "Failed to query GPU decoder capabilities";
		}
		else
		{
			LOG(INFO) << "cuvid Decoder Caps nMaxWidth " << videoDecodeCaps.nMaxWidth << " nMaxHeigth " << videoDecodeCaps.nMaxHeight << std::endl;
			if (!videoDecodeCaps.bIsSupported)
			{
				erroStr = "Codec not supported on this GPU Decoder";
				errCode = -1;
			}
			else if (videoDecodeCaps.nMaxWidth >= videoW && videoDecodeCaps.nMaxHeight >= videoH)
			{
				// The GPU can decode this codec at the requested resolution.
				pDecoder = new NvDecoder(cuContext, videoW, videoH, eOutputFormat != native,
					eCodec, NULL, bLowLatency, eOutputFormat);
				pDecoder->Start();
				return pDecoder;
			}
			else
			{
				erroStr = "Width and height not supported on this GPU Decoder";
				errCode = -2;
			}
		}
	}
	catch (const std::exception &e)
	{
		erroStr = e.what();
		errCode = -3;
	}

	// Failure: release whatever was created so nothing leaks.
	delete pDecoder; // non-null only when Start() threw
	if (cuContext)
		cuCtxDestroy(cuContext);
	return NULL;
}
  • 第三步,调用解码函数解码
代码语言:txt
复制
/**
 * Decodes one packet and hands back the frames that became available.
 *
 * With the native output format the frame pointers produced by
 * NvDecoder::Decode are passed through unchanged. For every other format each
 * frame is color-converted on the GPU and copied into a host buffer kept in
 * the decoder's frame-buffer vector; buffers are allocated on demand and
 * reused by later calls.
 *
 * @param handle          Handle returned by SKEYENvDecoder_Create.
 * @param pData           Encoded data.
 * @param nSize           Size of pData in bytes.
 * @param pppFrame        Receives a pointer to the array of frame pointers;
 *                        only written when at least one frame was returned.
 * @param pnFrameLen      Optional; receives the decoder's output frame size.
 * @param pnFrameReturned Optional; receives the number of frames returned
 *                        (0 when decoding fails).
 * @return 1 on success (same convention as SKEYENvDecoder_Initsize),
 *         -1 if handle is NULL, -2 if the decoder reports a failure.
 *
 * NOTE(review): the frame size is read before Decode() runs; confirm that
 * GetOutFrameSize() is already valid for the first packet.
 */
int SKEYENvDecoder_Decode(SKEYENVDECODER_HANDLE handle, const uint8_t *pData, int nSize, uint8_t ***pppFrame, int* pnFrameLen, int *pnFrameReturned)
{
	if (!handle)
		return -1;
	NvDecoder* pDecoder = static_cast<NvDecoder*>(handle);

	// Never leave a stale frame count behind on the failure path.
	if (pnFrameReturned)
		*pnFrameReturned = 0;

	std::vector<uint8_t *>* vecOutBuffer = pDecoder->GetFrameBufferVector();
	const size_t nFrameSize = pDecoder->GetOutFrameSize();
	if (pnFrameLen)
		*pnFrameLen = static_cast<int>(nFrameSize);

	int nFrameReturned = 0;
	uint8_t **ppFrame = NULL;
	const bool bSuc = pDecoder->Decode(pData, nSize, &ppFrame, &nFrameReturned, CUVID_PKT_ENDOFPICTURE);
	if (!bSuc)
		return -2;

	const auto eFormat = pDecoder->GetSetOutputFormat();
	if (native != eFormat && nFrameReturned > 0)
	{
		// These do not change while the frames of one call are converted.
		const bool bIs8Bit = (pDecoder->GetBitDepth() == 8);
		const bool bYuv444 = (pDecoder->GetOutputFormat() == cudaVideoSurfaceFormat_YUV444);
		const auto nWidth = pDecoder->GetWidth();
		const auto nHeight = pDecoder->GetHeight();

		for (int i = 0; i < nFrameReturned; i++)
		{
			const size_t nIndex = static_cast<size_t>(i);
			if (nIndex >= vecOutBuffer->size())
				vecOutBuffer->push_back(new uint8_t[nFrameSize]);

			// NOTE(review): only 8-bit content is converted. For other bit
			// depths the host buffer is handed out without being filled.
			if (!bIs8Bit)
				continue;

			uint8_t *pSrcFrame = ppFrame[i];
			uint8_t *pHostFrame = (*vecOutBuffer)[nIndex];
			const auto dpImage = pDecoder->GetDeviceImagePtr();
			uint8_t *pDevImage = reinterpret_cast<uint8_t*>(dpImage);

			switch (eFormat)
			{
			case native:
				break;
			case bgrp:
				if (bYuv444)
					YUV444ToColorPlanar<BGRA32>(pSrcFrame, nWidth, pDevImage, nWidth, nWidth, nHeight);
				else
					Nv12ToColorPlanar<BGRA32>(pSrcFrame, nWidth, pDevImage, nWidth, nWidth, nHeight);
				GetImage(dpImage, pHostFrame, nWidth, 3 * nHeight);
				break;
			case rgbp:
				if (bYuv444)
					YUV444ToColorPlanar<RGBA32>(pSrcFrame, nWidth, pDevImage, nWidth, nWidth, nHeight);
				else
					Nv12ToColorPlanar<RGBA32>(pSrcFrame, nWidth, pDevImage, nWidth, nWidth, nHeight);
				GetImage(dpImage, pHostFrame, nWidth, 3 * nHeight);
				break;
			case bgra:
				if (bYuv444)
					YUV444ToColor32<BGRA32>(pSrcFrame, nWidth, pDevImage, 4 * nWidth, nWidth, nHeight);
				else
					Nv12ToColor32<BGRA32>(pSrcFrame, nWidth, pDevImage, 4 * nWidth, nWidth, nHeight);
				GetImage(dpImage, pHostFrame, 4 * nWidth, nHeight);
				break;
			case rgba:
				if (bYuv444)
					YUV444ToColor32<RGBA32>(pSrcFrame, nWidth, pDevImage, 4 * nWidth, nWidth, nHeight);
				else
					Nv12ToColor32<RGBA32>(pSrcFrame, nWidth, pDevImage, 4 * nWidth, nWidth, nHeight);
				GetImage(dpImage, pHostFrame, 4 * nWidth, nHeight);
				break;
			case bgra64:
				if (bYuv444)
					YUV444ToColor64<BGRA64>(pSrcFrame, nWidth, pDevImage, 8 * nWidth, nWidth, nHeight);
				else
					Nv12ToColor64<BGRA64>(pSrcFrame, nWidth, pDevImage, 8 * nWidth, nWidth, nHeight);
				GetImage(dpImage, pHostFrame, 8 * nWidth, nHeight);
				break;
			case rgba64:
				if (bYuv444)
					YUV444ToColor64<RGBA64>(pSrcFrame, nWidth, pDevImage, 8 * nWidth, nWidth, nHeight);
				else
					Nv12ToColor64<RGBA64>(pSrcFrame, nWidth, pDevImage, 8 * nWidth, nWidth, nHeight);
				GetImage(dpImage, pHostFrame, 8 * nWidth, nHeight);
				break;
			}
		}
	}

	if (pnFrameReturned)
		*pnFrameReturned = nFrameReturned;

	if (nFrameReturned > 0 && pppFrame)
	{
		if (native != eFormat)
		{
			// Converted frames live in the decoder's host buffers.
			if (!vecOutBuffer->empty())
				*pppFrame = &(*vecOutBuffer)[0];
		}
		else if (ppFrame)
		{
			*pppFrame = ppFrame;
		}
	}

	// The original fell off the end of this non-void function.
	return 1;
}
  • 第四步,停止解码,销毁解码器
代码语言:txt
复制
/**
 * Stops the decoder behind `handle`, destroys it, and steps the GPU cursor
 * m_curIndex back by one without letting it go below zero.
 * A NULL handle is ignored.
 */
void SKEYENvDecoder_Release(SKEYENVDECODER_HANDLE handle)
{
	if (handle)
	{
		NvDecoder* pInstance = (NvDecoder*)handle;
		pInstance->Stop();
		delete pInstance;

		// Decrement, then clamp at zero.
		if (--m_curIndex < 0)
		{
			m_curIndex = 0;
		}
	}
}
  • 第五步,注销解码器,释放资源
代码语言:txt
复制
/**
 * Marks the library as uninitialized, destroys every CUDA context stored in
 * m_ctxV, empties the list and resets the GPU cursor.
 *
 * @return Always 1.
 */
int SKEYENvDecoder_Uninitsize()
{
	isInitsized = false;

	// The context is the first member of each stored entry.
	for (auto &ctxEntry : m_ctxV)
	{
		cuCtxDestroy(ctxEntry.first);
	}
	m_ctxV.clear();

	m_curIndex = 0;
	return 1;
}

至此,SKEYENvDecoder的封装就完成了,我们可以通过其接口调用Nvidia的显卡进行硬件解码测试。以下为真实应用效果:硬解12路时,CPU(I5)占比11,730显卡占75-80,如下图所示:

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • SkeyeVideoDecoder基于Nvidia独立显卡的硬解码库SkeyeNvDecoder
    • 1. 接口声明如下:
      • 2. SkeyeNvDecoder解码库调用流程
      相关产品与服务
      GPU 云服务器
      GPU 云服务器(Cloud GPU Service,GPU)是提供 GPU 算力的弹性计算服务,具有超强的并行计算能力,作为 IaaS 层的尖兵利器,服务于生成式AI,自动驾驶,深度学习训练、科学计算、图形图像处理、视频编解码等场景。腾讯云随时提供触手可得的算力,有效缓解您的计算压力,提升业务效率与竞争力。
      领券
      问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档