Created
February 18, 2022 14:01
-
-
Save albertz/39813d93f2690b4c6d7347864a2b4b04 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from returnn.tf.util.data import Dim, batch_dim, single_step_dim, SpatialDim, FeatureDim | |
# RETURNN config flags.
use_tensorflow = True
behavior_version = 12

# Dims of the network input: `extern_data` declares 'data' over
# (batch_dim, time_dim, input_dim).
time_dim = SpatialDim('time')
input_dim = FeatureDim('input', 10)

# Dims used inside the `conv_subsample_layer` subnetwork.
# `split_dims` splits input_dim into (input_dim, dummy_input_feature_dim).
dummy_input_feature_dim = FeatureDim('dummy-input-feature-dim', 1)
# Filter dims and output feature dim of `conv_layers.0`.
filter_dim0_dim = SpatialDim('filter-dim0', 3)
filter_dim1_dim = SpatialDim('filter-dim1', 3)
intermediate_out_sub_sample_dim = FeatureDim('intermediate_out_sub_sample', 14)
# Output spatial dims of the first `pool` layer.
conv_subsample_layer_out_spatial_dim0_dim = SpatialDim('conv_subsample_layer:out-spatial-dim0')
conv_subsample_layer_out_spatial_dim1_dim = SpatialDim('conv_subsample_layer:out-spatial-dim1', 5)
# Filter dims and output feature dim of `conv_layers.1`; out_dim is also the
# feature dim used by the `linear` layer and the `layers/*` blocks.
filter_dim0_0_dim = SpatialDim('filter-dim0', 3)
filter_dim1_0_dim = SpatialDim('filter-dim1', 3)
out_dim = FeatureDim('out', 14)
# Output spatial dims of the second pooling layer (`pool_0`).
conv_subsample_layer_out_spatial_dim0_0_dim = SpatialDim('conv_subsample_layer:out-spatial-dim0')
conv_subsample_layer_out_spatial_dim1_0_dim = SpatialDim('conv_subsample_layer:out-spatial-dim1', 3)
# Result of `merge_dims` over the two `pool_0` output spatial dims.
conv_subsample_layer_out_dim = SpatialDim('conv_subsample_layer:out_dim')

# Dims used inside the `layers/*` blocks.
# Hidden dim of the feed-forward sub-blocks (`ffn1` / `ffn2`).
ff_dim = FeatureDim('ff', 17)
# Head dim produced by `qkv_split_dims` in `self_att`.
num_heads_dim = SpatialDim('num_heads', 2)
# Replaces conv_subsample_layer_out_dim on keys/values in layers/0/self_att.
layers_0_self_att_history_dim = SpatialDim('layers/0/self_att:history')
# Filter dim of the `depthwise_conv` filter (filter_size 32).
filter_dim0_1_dim = SpatialDim('filter-dim0', 32)
# NOTE(review): presumably the layers/1 counterpart of
# layers_0_self_att_history_dim; its use is outside the visible part of the
# file -- confirm.
layers_1_self_att_history_dim = SpatialDim('layers/1/self_att:history')
# Input declaration: one float32 stream named 'data' over the dims
# (batch_dim, time_dim, input_dim), flagged as available for inference.
extern_data = {
    'data': dict(
        dim_tags=(batch_dim, time_dim, input_dim),
        dtype='float32',
        available_for_inference=True,
    ),
}
| network = { | |
| 'conv_subsample_layer': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'split_dims': { | |
| 'class': 'split_dims', | |
| 'from': 'base:data:data', | |
| 'axis': input_dim, | |
| 'dims': ( | |
| input_dim, | |
| dummy_input_feature_dim | |
| ), | |
| 'out_shape': {batch_dim, time_dim, input_dim, dummy_input_feature_dim} | |
| }, | |
| 'conv_layers.0': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| filter_dim0_dim, | |
| filter_dim1_dim, | |
| dummy_input_feature_dim, | |
| intermediate_out_sub_sample_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.21081851067789195, | |
| 'maxval': 0.21081851067789195, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'conv': { | |
| 'class': 'conv', | |
| 'from': 'base:split_dims', | |
| 'in_dim': dummy_input_feature_dim, | |
| 'in_spatial_dims': [ | |
| time_dim, | |
| input_dim | |
| ], | |
| 'out_dim': intermediate_out_sub_sample_dim, | |
| 'out_spatial_dims': [ | |
| time_dim, | |
| input_dim | |
| ], | |
| 'filter_size': [3, 3], | |
| 'padding': 'same', | |
| 'filter': 'filter', | |
| 'with_bias': True, | |
| 'bias': 'bias', | |
| 'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'conv', | |
| 'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim} | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| intermediate_out_sub_sample_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| }, | |
| 'filter': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| filter_dim0_dim, | |
| filter_dim1_dim, | |
| dummy_input_feature_dim, | |
| intermediate_out_sub_sample_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| } | |
| }, | |
| 'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim}, | |
| 'name_scope': 'conv_layers/0' | |
| }, | |
| 'relu': { | |
| 'class': 'activation', | |
| 'from': 'conv_layers.0/conv', | |
| 'activation': 'relu', | |
| 'out_shape': {batch_dim, time_dim, input_dim, intermediate_out_sub_sample_dim} | |
| }, | |
| 'pool': { | |
| 'class': 'pool', | |
| 'from': 'relu', | |
| 'mode': 'max', | |
| 'pool_size': (2, 2), | |
| 'padding': 'same', | |
| 'in_spatial_dims': [ | |
| time_dim, | |
| input_dim | |
| ], | |
| 'out_spatial_dims': [ | |
| conv_subsample_layer_out_spatial_dim0_dim, | |
| conv_subsample_layer_out_spatial_dim1_dim | |
| ], | |
| 'out_shape': {batch_dim, intermediate_out_sub_sample_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim} | |
| }, | |
| 'dropout_0': { | |
| 'class': 'dropout', | |
| 'from': 'pool', | |
| 'dropout': 0.1, | |
| 'dropout_axis': intermediate_out_sub_sample_dim, | |
| 'out_shape': {batch_dim, intermediate_out_sub_sample_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim} | |
| }, | |
| 'conv_layers.1': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| filter_dim0_0_dim, | |
| filter_dim1_0_dim, | |
| intermediate_out_sub_sample_dim, | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.1543033499620919, | |
| 'maxval': 0.1543033499620919, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'conv': { | |
| 'class': 'conv', | |
| 'from': 'base:dropout_0', | |
| 'in_dim': intermediate_out_sub_sample_dim, | |
| 'in_spatial_dims': [ | |
| conv_subsample_layer_out_spatial_dim0_dim, | |
| conv_subsample_layer_out_spatial_dim1_dim | |
| ], | |
| 'out_dim': out_dim, | |
| 'out_spatial_dims': [ | |
| conv_subsample_layer_out_spatial_dim0_dim, | |
| conv_subsample_layer_out_spatial_dim1_dim | |
| ], | |
| 'filter_size': [3, 3], | |
| 'padding': 'same', | |
| 'filter': 'filter', | |
| 'with_bias': True, | |
| 'bias': 'bias', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'conv', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim} | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| }, | |
| 'filter': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| filter_dim0_0_dim, | |
| filter_dim1_0_dim, | |
| intermediate_out_sub_sample_dim, | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim}, | |
| 'name_scope': 'conv_layers/1' | |
| }, | |
| 'relu_0': { | |
| 'class': 'activation', | |
| 'from': 'conv_layers.1/conv', | |
| 'activation': 'relu', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_spatial_dim0_dim, conv_subsample_layer_out_spatial_dim1_dim, out_dim} | |
| }, | |
| 'pool_0': { | |
| 'class': 'pool', | |
| 'from': 'relu_0', | |
| 'mode': 'max', | |
| 'pool_size': (2, 2), | |
| 'padding': 'same', | |
| 'in_spatial_dims': [ | |
| conv_subsample_layer_out_spatial_dim0_dim, | |
| conv_subsample_layer_out_spatial_dim1_dim | |
| ], | |
| 'out_spatial_dims': [ | |
| conv_subsample_layer_out_spatial_dim0_0_dim, | |
| conv_subsample_layer_out_spatial_dim1_0_dim | |
| ], | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_spatial_dim0_0_dim, conv_subsample_layer_out_spatial_dim1_0_dim} | |
| }, | |
| 'dropout_1': { | |
| 'class': 'dropout', | |
| 'from': 'pool_0', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_spatial_dim0_0_dim, conv_subsample_layer_out_spatial_dim1_0_dim} | |
| }, | |
| 'merge_dims': { | |
| 'class': 'merge_dims', | |
| 'from': 'dropout_1', | |
| 'axes': [ | |
| conv_subsample_layer_out_spatial_dim0_0_dim, | |
| conv_subsample_layer_out_spatial_dim1_0_dim | |
| ], | |
| 'out_dim': conv_subsample_layer_out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'merge_dims', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'linear': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim.copy(match_priority=1), | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.4629100498862757, | |
| 'maxval': 0.4629100498862757, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:conv_subsample_layer/merge_dims', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'dot', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim.copy(match_priority=1), | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'dropout_0': { | |
| 'class': 'dropout', | |
| 'from': 'linear', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layers': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| '0': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'layer_norm': { | |
| 'class': 'layer_norm', | |
| 'from': 'base:base:dropout_0', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'ffn1': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'linear_ff': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim, | |
| ff_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.43994134506405985, | |
| 'maxval': 0.43994134506405985, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:base:layer_norm', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim, | |
| ff_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| ff_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'swish': { | |
| 'class': 'activation', | |
| 'from': 'linear_ff', | |
| 'activation': 'swish', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'dropout_0': { | |
| 'class': 'dropout', | |
| 'from': 'swish', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'linear_out': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| ff_dim, | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.43994134506405985, | |
| 'maxval': 0.43994134506405985, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:dropout_0', 'weight'], | |
| 'reduce': ff_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| ff_dim, | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'linear_out', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'dropout_0': { | |
| 'class': 'dropout', | |
| 'from': 'ffn1', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'constant': {'class': 'constant', 'value': 0.5}, | |
| 'mul': { | |
| 'class': 'combine', | |
| 'from': ['constant', 'dropout_0'], | |
| 'kind': 'mul', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['mul', 'base:base:dropout_0'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layer_norm_0': { | |
| 'class': 'layer_norm', | |
| 'from': 'add', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'self_att': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'qkv': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim, | |
| 3 * out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.32732683535398854, | |
| 'maxval': 0.32732683535398854, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:base:layer_norm_0', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim, | |
| 3 * out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| 3 * out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
| }, | |
| 'qkv_split_dims': { | |
| 'class': 'split_dims', | |
| 'from': 'qkv', | |
| 'axis': 3 * out_dim, | |
| 'dims': ( | |
| num_heads_dim, | |
| 3 * out_dim.div_left(num_heads_dim) | |
| ), | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'qkv_split': { | |
| 'class': 'split', | |
| 'from': 'qkv_split_dims', | |
| 'axis': 3 * out_dim.div_left(num_heads_dim), | |
| 'out_dims': ( | |
| out_dim.div_left(num_heads_dim), | |
| out_dim.div_left(num_heads_dim), | |
| out_dim.div_left(num_heads_dim) | |
| ), | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'k_new_dim': { | |
| 'class': 'reinterpret_data', | |
| 'set_dim_tags': { | |
| conv_subsample_layer_out_dim: layers_0_self_att_history_dim | |
| }, | |
| 'from': 'qkv_split/1', | |
| 'out_shape': {batch_dim, num_heads_dim, layers_0_self_att_history_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'v_new_dim': { | |
| 'class': 'reinterpret_data', | |
| 'set_dim_tags': { | |
| conv_subsample_layer_out_dim: layers_0_self_att_history_dim | |
| }, | |
| 'from': 'qkv_split/2', | |
| 'out_shape': {batch_dim, num_heads_dim, layers_0_self_att_history_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'dot_attention': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'constant': {'class': 'constant', 'value': 0.37796447300922725}, | |
| 'mul': { | |
| 'class': 'combine', | |
| 'from': ['base:qkv_split/0', 'constant'], | |
| 'kind': 'mul', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'energy': { | |
| 'class': 'dot', | |
| 'from': ['mul', 'base:k_new_dim'], | |
| 'reduce': out_dim.div_left(num_heads_dim), | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim} | |
| }, | |
| 'att_weights': { | |
| 'class': 'softmax_over_spatial', | |
| 'from': 'energy', | |
| 'axis': layers_0_self_att_history_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim} | |
| }, | |
| 'dropout': { | |
| 'class': 'dropout', | |
| 'from': 'att_weights', | |
| 'dropout': 0.1, | |
| 'dropout_axis': layers_0_self_att_history_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_0_self_att_history_dim} | |
| }, | |
| 'att': { | |
| 'class': 'dot', | |
| 'from': ['dropout', 'base:v_new_dim'], | |
| 'reduce': layers_0_self_att_history_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'att', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'output_0': { | |
| 'class': 'merge_dims', | |
| 'from': 'dot_attention', | |
| 'axes': ( | |
| num_heads_dim, | |
| out_dim.div_left(num_heads_dim) | |
| ), | |
| 'out_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'output_0', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add_0': { | |
| 'class': 'combine', | |
| 'from': ['self_att', 'add'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layer_norm_1': { | |
| 'class': 'layer_norm', | |
| 'from': 'add_0', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'conv_block': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'positionwise_conv1': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim, | |
| 2 * out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.3779644730092272, | |
| 'maxval': 0.3779644730092272, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:base:layer_norm_1', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim, | |
| 2 * out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| 2 * out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'glu': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'split': { | |
| 'class': 'split', | |
| 'from': 'base:positionwise_conv1', | |
| 'axis': 2 * out_dim, | |
| 'out_dims': [ | |
| out_dim, | |
| out_dim | |
| ], | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'sigmoid': { | |
| 'class': 'activation', | |
| 'from': 'split/1', | |
| 'activation': 'sigmoid', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'mul': { | |
| 'class': 'combine', | |
| 'from': ['split/0', 'sigmoid'], | |
| 'kind': 'mul', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'mul', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'depthwise_conv': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| filter_dim0_1_dim, | |
| out_dim // 14, | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.11180339887498948, | |
| 'maxval': 0.11180339887498948, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'conv': { | |
| 'class': 'conv', | |
| 'from': 'base:glu', | |
| 'in_dim': out_dim, | |
| 'in_spatial_dims': [ | |
| conv_subsample_layer_out_dim | |
| ], | |
| 'out_dim': out_dim, | |
| 'out_spatial_dims': [ | |
| conv_subsample_layer_out_dim | |
| ], | |
| 'filter_size': [32], | |
| 'padding': 'same', | |
| 'groups': 14, | |
| 'filter': 'filter', | |
| 'with_bias': True, | |
| 'bias': 'bias', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'conv', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| }, | |
| 'filter': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| filter_dim0_1_dim, | |
| out_dim // 14, | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'norm': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'batch_norm': { | |
| 'class': 'batch_norm', | |
| 'from': 'base:depthwise_conv/conv', | |
| 'in_dim': out_dim, | |
| 'use_std': True, | |
| 'use_shift': True, | |
| 'param_version': 2, | |
| 'reuse_params': { | |
| 'map': { | |
| 'batch_norm/v2_mean': {'layer_output': 'running_mean'}, | |
| 'batch_norm/v2_variance': {'layer_output': 'running_variance'}, | |
| 'batch_norm/v2_gamma': {'layer_output': 'gamma'}, | |
| 'batch_norm/v2_beta': {'layer_output': 'beta'} | |
| } | |
| }, | |
| 'momentum': 0.1, | |
| 'epsilon': 0.001, | |
| 'masked_time': False, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'batch_norm', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'beta': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| }, | |
| 'gamma': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 1.0 | |
| }, | |
| 'running_mean': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'trainable': False, | |
| 'init': 0.0 | |
| }, | |
| 'running_variance': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'trainable': False, | |
| 'init': 1.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'swish': { | |
| 'class': 'activation', | |
| 'from': 'norm', | |
| 'activation': 'swish', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'positionwise_conv2': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim.copy(match_priority=1), | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.4629100498862757, | |
| 'maxval': 0.4629100498862757, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:swish', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim.copy(match_priority=1), | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'positionwise_conv2', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'dropout_1': { | |
| 'class': 'dropout', | |
| 'from': 'conv_block', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add_1': { | |
| 'class': 'combine', | |
| 'from': ['dropout_1', 'add_0'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layer_norm_2': { | |
| 'class': 'layer_norm', | |
| 'from': 'add_1', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'ffn2': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'linear_ff': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim, | |
| ff_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.43994134506405985, | |
| 'maxval': 0.43994134506405985, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:base:layer_norm_2', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim, | |
| ff_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| ff_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'swish': { | |
| 'class': 'activation', | |
| 'from': 'linear_ff', | |
| 'activation': 'swish', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'dropout_0': { | |
| 'class': 'dropout', | |
| 'from': 'swish', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'linear_out': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| ff_dim, | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.43994134506405985, | |
| 'maxval': 0.43994134506405985, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:dropout_0', 'weight'], | |
| 'reduce': ff_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| ff_dim, | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'linear_out', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'dropout_2': { | |
| 'class': 'dropout', | |
| 'from': 'ffn2', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'constant_0': {'class': 'constant', 'value': 0.5}, | |
| 'mul_0': { | |
| 'class': 'combine', | |
| 'from': ['constant_0', 'dropout_2'], | |
| 'kind': 'mul', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add_2': { | |
| 'class': 'combine', | |
| 'from': ['mul_0', 'add_1'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layer_norm_3': { | |
| 'class': 'layer_norm', | |
| 'from': 'add_2', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'layer_norm_3', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| '1': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'layer_norm': { | |
| 'class': 'layer_norm', | |
| 'from': 'base:0', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'ffn1': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'linear_ff': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim, | |
| ff_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.43994134506405985, | |
| 'maxval': 0.43994134506405985, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:base:layer_norm', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim, | |
| ff_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| ff_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'swish': { | |
| 'class': 'activation', | |
| 'from': 'linear_ff', | |
| 'activation': 'swish', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'dropout_0': { | |
| 'class': 'dropout', | |
| 'from': 'swish', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'linear_out': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| ff_dim, | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.43994134506405985, | |
| 'maxval': 0.43994134506405985, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:dropout_0', 'weight'], | |
| 'reduce': ff_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| ff_dim, | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'linear_out', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'dropout_0': { | |
| 'class': 'dropout', | |
| 'from': 'ffn1', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'constant': {'class': 'constant', 'value': 0.5}, | |
| 'mul': { | |
| 'class': 'combine', | |
| 'from': ['constant', 'dropout_0'], | |
| 'kind': 'mul', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['mul', 'base:0'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layer_norm_0': { | |
| 'class': 'layer_norm', | |
| 'from': 'add', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'self_att': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'qkv': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim, | |
| 3 * out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.32732683535398854, | |
| 'maxval': 0.32732683535398854, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:base:layer_norm_0', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim, | |
| 3 * out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| 3 * out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 3 * out_dim} | |
| }, | |
| 'qkv_split_dims': { | |
| 'class': 'split_dims', | |
| 'from': 'qkv', | |
| 'axis': 3 * out_dim, | |
| 'dims': ( | |
| num_heads_dim, | |
| 3 * out_dim.div_left(num_heads_dim) | |
| ), | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'qkv_split': { | |
| 'class': 'split', | |
| 'from': 'qkv_split_dims', | |
| 'axis': 3 * out_dim.div_left(num_heads_dim), | |
| 'out_dims': ( | |
| out_dim.div_left(num_heads_dim), | |
| out_dim.div_left(num_heads_dim), | |
| out_dim.div_left(num_heads_dim) | |
| ), | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, 3 * out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'k_new_dim': { | |
| 'class': 'reinterpret_data', | |
| 'set_dim_tags': { | |
| conv_subsample_layer_out_dim: layers_1_self_att_history_dim | |
| }, | |
| 'from': 'qkv_split/1', | |
| 'out_shape': {batch_dim, num_heads_dim, layers_1_self_att_history_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'v_new_dim': { | |
| 'class': 'reinterpret_data', | |
| 'set_dim_tags': { | |
| conv_subsample_layer_out_dim: layers_1_self_att_history_dim | |
| }, | |
| 'from': 'qkv_split/2', | |
| 'out_shape': {batch_dim, num_heads_dim, layers_1_self_att_history_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'dot_attention': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'constant': {'class': 'constant', 'value': 0.37796447300922725}, | |
| 'mul': { | |
| 'class': 'combine', | |
| 'from': ['base:qkv_split/0', 'constant'], | |
| 'kind': 'mul', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'energy': { | |
| 'class': 'dot', | |
| 'from': ['mul', 'base:k_new_dim'], | |
| 'reduce': out_dim.div_left(num_heads_dim), | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim} | |
| }, | |
| 'att_weights': { | |
| 'class': 'softmax_over_spatial', | |
| 'from': 'energy', | |
| 'axis': layers_1_self_att_history_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim} | |
| }, | |
| 'dropout': { | |
| 'class': 'dropout', | |
| 'from': 'att_weights', | |
| 'dropout': 0.1, | |
| 'dropout_axis': layers_1_self_att_history_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, layers_1_self_att_history_dim} | |
| }, | |
| 'att': { | |
| 'class': 'dot', | |
| 'from': ['dropout', 'base:v_new_dim'], | |
| 'reduce': layers_1_self_att_history_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'att', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, num_heads_dim, out_dim.div_left(num_heads_dim)} | |
| }, | |
| 'output_0': { | |
| 'class': 'merge_dims', | |
| 'from': 'dot_attention', | |
| 'axes': ( | |
| num_heads_dim, | |
| out_dim.div_left(num_heads_dim) | |
| ), | |
| 'out_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'output_0', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add_0': { | |
| 'class': 'combine', | |
| 'from': ['self_att', 'add'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layer_norm_1': { | |
| 'class': 'layer_norm', | |
| 'from': 'add_0', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'conv_block': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'positionwise_conv1': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim, | |
| 2 * out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.3779644730092272, | |
| 'maxval': 0.3779644730092272, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:base:layer_norm_1', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim, | |
| 2 * out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| 2 * out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'glu': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'split': { | |
| 'class': 'split', | |
| 'from': 'base:positionwise_conv1', | |
| 'axis': 2 * out_dim, | |
| 'out_dims': [ | |
| out_dim, | |
| out_dim | |
| ], | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, 2 * out_dim} | |
| }, | |
| 'sigmoid': { | |
| 'class': 'activation', | |
| 'from': 'split/1', | |
| 'activation': 'sigmoid', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'mul': { | |
| 'class': 'combine', | |
| 'from': ['split/0', 'sigmoid'], | |
| 'kind': 'mul', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'mul', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'depthwise_conv': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| filter_dim0_1_dim, | |
| out_dim // 14, | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.11180339887498948, | |
| 'maxval': 0.11180339887498948, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'conv': { | |
| 'class': 'conv', | |
| 'from': 'base:glu', | |
| 'in_dim': out_dim, | |
| 'in_spatial_dims': [ | |
| conv_subsample_layer_out_dim | |
| ], | |
| 'out_dim': out_dim, | |
| 'out_spatial_dims': [ | |
| conv_subsample_layer_out_dim | |
| ], | |
| 'filter_size': [32], | |
| 'padding': 'same', | |
| 'groups': 14, | |
| 'filter': 'filter', | |
| 'with_bias': True, | |
| 'bias': 'bias', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'conv', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| }, | |
| 'filter': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| filter_dim0_1_dim, | |
| out_dim // 14, | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'norm': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'batch_norm': { | |
| 'class': 'batch_norm', | |
| 'from': 'base:depthwise_conv/conv', | |
| 'in_dim': out_dim, | |
| 'use_std': True, | |
| 'use_shift': True, | |
| 'param_version': 2, | |
| 'reuse_params': { | |
| 'map': { | |
| 'batch_norm/v2_mean': {'layer_output': 'running_mean'}, | |
| 'batch_norm/v2_variance': {'layer_output': 'running_variance'}, | |
| 'batch_norm/v2_gamma': {'layer_output': 'gamma'}, | |
| 'batch_norm/v2_beta': {'layer_output': 'beta'} | |
| } | |
| }, | |
| 'momentum': 0.1, | |
| 'epsilon': 0.001, | |
| 'masked_time': False, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'batch_norm', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'beta': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| }, | |
| 'gamma': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 1.0 | |
| }, | |
| 'running_mean': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'trainable': False, | |
| 'init': 0.0 | |
| }, | |
| 'running_variance': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'trainable': False, | |
| 'init': 1.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'swish': { | |
| 'class': 'activation', | |
| 'from': 'norm', | |
| 'activation': 'swish', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'positionwise_conv2': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim.copy(match_priority=1), | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.4629100498862757, | |
| 'maxval': 0.4629100498862757, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:swish', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim.copy(match_priority=1), | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'positionwise_conv2', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'dropout_1': { | |
| 'class': 'dropout', | |
| 'from': 'conv_block', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add_1': { | |
| 'class': 'combine', | |
| 'from': ['dropout_1', 'add_0'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layer_norm_2': { | |
| 'class': 'layer_norm', | |
| 'from': 'add_1', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'ffn2': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'linear_ff': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| out_dim, | |
| ff_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.43994134506405985, | |
| 'maxval': 0.43994134506405985, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:base:layer_norm_2', 'weight'], | |
| 'reduce': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim, | |
| ff_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| ff_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'swish': { | |
| 'class': 'activation', | |
| 'from': 'linear_ff', | |
| 'activation': 'swish', | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'dropout_0': { | |
| 'class': 'dropout', | |
| 'from': 'swish', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, conv_subsample_layer_out_dim, ff_dim} | |
| }, | |
| 'linear_out': { | |
| 'class': 'subnetwork', | |
| 'from': [], | |
| 'subnetwork': { | |
| 'random': { | |
| 'class': 'random', | |
| 'shape': ( | |
| ff_dim, | |
| out_dim | |
| ), | |
| 'distribution': 'uniform', | |
| 'minval': -0.43994134506405985, | |
| 'maxval': 0.43994134506405985, | |
| 'dtype': 'float32', | |
| 'static': True | |
| }, | |
| 'dot': { | |
| 'class': 'dot', | |
| 'from': ['base:dropout_0', 'weight'], | |
| 'reduce': ff_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add': { | |
| 'class': 'combine', | |
| 'from': ['dot', 'bias'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'weight': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| ff_dim, | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init_by_layer': 'random' | |
| }, | |
| 'bias': { | |
| 'class': 'variable', | |
| 'shape': [ | |
| out_dim | |
| ], | |
| 'param_name': 'param', | |
| 'init': 0.0 | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'linear_out', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'dropout_2': { | |
| 'class': 'dropout', | |
| 'from': 'ffn2', | |
| 'dropout': 0.1, | |
| 'dropout_axis': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'constant_0': {'class': 'constant', 'value': 0.5}, | |
| 'mul_0': { | |
| 'class': 'combine', | |
| 'from': ['constant_0', 'dropout_2'], | |
| 'kind': 'mul', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'add_2': { | |
| 'class': 'combine', | |
| 'from': ['mul_0', 'add_1'], | |
| 'kind': 'add', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'layer_norm_3': { | |
| 'class': 'layer_norm', | |
| 'from': 'add_2', | |
| 'in_dim': out_dim, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'layer_norm_3', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': '1', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| }, | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| }, | |
| 'output': { | |
| 'class': 'copy', | |
| 'from': 'layers', | |
| 'out_shape': {batch_dim, out_dim, conv_subsample_layer_out_dim} | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment