AssemblyAI: Building an End-to-End Speech Recognition Model in PyTorch (Part 3)


Recurrent neural networks (RNNs) excel at sequence modeling problems. The RNN processes the audio features step by step, making a prediction for each frame while using context from the previous frames. We use a bidirectional RNN (BiRNN) because we want the context not only of the frames before each step, but of the frames after it as well.
This helps the model make better predictions, because each frame of audio has more information available before a prediction is made. We use the gated recurrent unit (GRU) variant of the RNN because it requires fewer computational resources than an LSTM and, in some cases, works just as well.
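To see why a bidirectional GRU doubles the feature dimension (its forward and backward outputs are concatenated), here is a minimal shape check; the dimensions are illustrative assumptions, not values prescribed by this tutorial:

import torch
import torch.nn as nn

rnn_dim = 512  # hypothetical feature size, for illustration only
gru = nn.GRU(input_size=rnn_dim, hidden_size=rnn_dim,
             num_layers=1, batch_first=True, bidirectional=True)

x = torch.randn(4, 100, rnn_dim)  # (batch, time, feature)
out, _ = gru(x)
print(out.shape)  # torch.Size([4, 100, 1024]): forward and backward outputs concatenated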
The model outputs a probability matrix over characters, which we feed into a decoder to extract the characters the model considers most likely.
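To make that decoding step concrete, here is a minimal greedy (argmax) decoder sketch. It assumes CTC-style training with a blank token; the blank index of 28 (matching a 29-class character set) is an assumption for illustration, and mapping indices back to text (e.g. an int_to_text helper) is left as a hypothetical step. The model definition itself follows below.

import torch

def greedy_decode(output, blank_label=28):
    # output: (batch, time, n_class) scores from the model.
    # Pick the most probable character at each time step, then apply
    # CTC-style collapsing: skip repeated predictions, drop blanks.
    arg_maxes = torch.argmax(output, dim=2)  # (batch, time)
    decodes = []
    for seq in arg_maxes:
        collapsed = []
        for j, idx in enumerate(seq):
            if idx == blank_label:
                continue
            if j > 0 and idx == seq[j - 1]:
                continue  # same character repeated across adjacent frames
            collapsed.append(idx.item())
        decodes.append(collapsed)  # map to text with a char map, e.g. int_to_text(collapsed)
    return decodes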
import torch
import torch.nn as nn
import torch.nn.functional as F


class CNNLayerNorm(nn.Module):
    """Layer normalization built for cnns input"""
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous()  # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()  # (batch, channel, feature, time)


class ResidualCNN(nn.Module):
    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf
    except with layer norm instead of batch norm
    """
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x  # (batch, channel, feature, time)


class BidirectionalGRU(nn.Module):
    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x


class SpeechRecognitionModel(nn.Module):
    """Speech Recognition Model Inspired by DeepSpeech 2"""
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats // 2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting hierarchical features
        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i == 0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim),  # birnn returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2)  # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x
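As a quick sanity check on the model above, here is a usage sketch. The hyperparameters (3 residual CNN layers, 5 BiGRU layers, rnn_dim of 512, 29 character classes, 128 mel features) are illustrative assumptions, not values prescribed here, and the input is random noise standing in for a spectrogram batch:

# Hypothetical hyperparameters for a shape check only.
model = SpeechRecognitionModel(
    n_cnn_layers=3, n_rnn_layers=5, rnn_dim=512,
    n_class=29, n_feats=128, stride=2, dropout=0.1,
)

spectrograms = torch.randn(4, 1, 128, 300)  # (batch, channel, n_feats, time)
output = model(spectrograms)
print(output.shape)  # torch.Size([4, 150, 29]): per-time-step character scores
                     # (log-softmax is typically applied before a CTC loss)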

