Model summary (print(model)): a ViT-Base/16 — patch size 16, embed dim 768, 12 pre-norm encoder blocks, MLP hidden dim 3072, 1000-class head — built from a timm-style PatchEmbed and the xFormers encoder stack. The 12 encoder blocks are verbatim-identical, so they are collapsed below into a single (0-11) entry in the style of PyTorch's repeated-module repr; note that (wrap_att) and (wrap_ff) print the attention and MLP submodules a second time, inside the Residual(PreNorm(...)) wrappers.

ViT(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (backbone): xFormer(
    (encoders): ModuleList(
      (0-11): 12 x xFormerEncoderBlock(
        (mha): MultiHeadDispatch(
          (attention): ScaledDotProduct(
            (attn_drop): Dropout(p=0.0, inplace=False)
          )
          (in_proj_container): InProjContainer()
          (resid_drop): DropPath()
          (proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (feedforward): MLP(
          (mlp): Sequential(
            (0): Linear(in_features=768, out_features=3072, bias=True)
            (1): GELU()
            (2): Dropout(p=0.0, inplace=False)
            (3): Linear(in_features=3072, out_features=768, bias=True)
            (4): Dropout(p=0.0, inplace=False)
          )
        )
        (wrap_att): Residual(
          (layer): PreNorm(
            (norm): FusedLayerNorm()
            (sublayer): MultiHeadDispatch(
              (attention): ScaledDotProduct(
                (attn_drop): Dropout(p=0.0, inplace=False)
              )
              (in_proj_container): InProjContainer()
              (resid_drop): DropPath()
              (proj): Linear(in_features=768, out_features=768, bias=True)
            )
          )
        )
        (wrap_ff): Residual(
          (layer): PreNorm(
            (norm): FusedLayerNorm()
            (sublayer): MLP(
              (mlp): Sequential(
                (0): Linear(in_features=768, out_features=3072, bias=True)
                (1): GELU()
                (2): Dropout(p=0.0, inplace=False)
                (3): Linear(in_features=3072, out_features=768, bias=True)
                (4): Dropout(p=0.0, inplace=False)
              )
            )
          )
        )
      )
    )
    (decoders): ModuleList()
  )
  (head): Sequential(
    (0): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (1): Linear(in_features=768, out_features=1000, bias=True)
  )
)
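
For context, a module tree like the one above can be assembled with the xFormers factory. The sketch below is a minimal reconstruction, not the gist author's actual code: it assumes an early-2022 xformers factory API (where the pre-norm option was spelled "layer_norm_style"), timm's PatchEmbed, num_heads=12, and mean-pooled tokens before the head — neither the head count nor the pooling strategy is visible in the printout.

# Minimal sketch of the architecture above -- NOT the gist author's code.
# Assumptions: early-2022 xformers factory API ("layer_norm_style" key),
# timm's PatchEmbed, num_heads=12, mean-pooled tokens before the head.
import torch
import torch.nn as nn
from timm.models.layers import PatchEmbed
from xformers.factory import xFormer, xFormerConfig


class ViT(nn.Module):
    def __init__(self, img_size=224, patch_size=16, dim=768,
                 n_layers=12, n_heads=12, num_classes=1000):
        super().__init__()
        # (patch_embed): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size,
                                      in_chans=3, embed_dim=dim)
        # (backbone): 12 identical pre-norm encoder blocks with
        # scaled-dot-product attention and a GELU MLP (hidden dim 4 * 768 = 3072).
        config = xFormerConfig([{
            "block_type": "encoder",
            "num_layers": n_layers,
            "dim_model": dim,
            "layer_norm_style": "pre",  # -> the Residual(PreNorm(...)) wrappers
            "multi_head_config": {
                "num_heads": n_heads,     # assumption: head count not shown in repr
                "residual_dropout": 0.0,  # -> (resid_drop)
                "attention": {
                    "name": "scaled_dot_product",  # -> ScaledDotProduct
                    "dropout": 0.0,                # -> (attn_drop)
                    "causal": False,
                },
            },
            "feedforward_config": {
                "name": "MLP",
                "dropout": 0.0,
                "activation": "gelu",
                "hidden_layer_multiplier": 4,  # 768 -> 3072 -> 768
            },
        }])
        self.backbone = xFormer.from_config(config)
        # (head): final LayerNorm + 1000-way classifier
        self.head = nn.Sequential(nn.LayerNorm(dim, eps=1e-6),
                                  nn.Linear(dim, num_classes))

    def forward(self, x):
        x = self.patch_embed(x)          # (B, 3, 224, 224) -> (B, 196, 768)
        x = self.backbone(x)             # (B, 196, 768)
        return self.head(x.mean(dim=1))  # pool tokens (a guess) -> (B, 1000)


model = ViT()
print(model)  # prints a module tree like the one above
assert model(torch.randn(1, 3, 224, 224)).shape == (1, 1000)

Positional information is deliberately omitted from the sketch: the printout contains no position-encoding submodule, so any positional or class-token parameters in the original model would live as plain nn.Parameters, which do not appear in a module repr. Likewise, FusedLayerNorm in the printout indicates a fused layer-norm path was available; which norm the factory picks is version- and environment-dependent.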