Created
January 8, 2025 14:22
-
-
Save nan-wang/430c2b86f07675304d6c401a3cddfe6a to your computer and use it in GitHub Desktop.
inspect_into_modernbert_tokenizer.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyM32Pf1T/gYmX06lwSM61Wl", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"f8d50d3263e94aa48d49b6df2c51d7e3": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HBoxModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HBoxView", | |
"box_style": "", | |
"children": [ | |
"IPY_MODEL_bf53f11922ae42eb995fcb00170fb27f", | |
"IPY_MODEL_c9d846ed39d4491cac3daef637e0a498", | |
"IPY_MODEL_ace914fcaf864e16bef11a5f97ce626c" | |
], | |
"layout": "IPY_MODEL_6b771547d7524857bbc0854bc62fa85f" | |
} | |
}, | |
"bf53f11922ae42eb995fcb00170fb27f": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HTMLModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HTMLView", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_663d82b5194149848cd22895d024ea7c", | |
"placeholder": "", | |
"style": "IPY_MODEL_361f137127b34ea3986e42ba05926cb6", | |
"value": "tokenizer_config.json: 100%" | |
} | |
}, | |
"c9d846ed39d4491cac3daef637e0a498": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "FloatProgressModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "ProgressView", | |
"bar_style": "success", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_760b75b6da344c1d83ea4362668a22bb", | |
"max": 20837, | |
"min": 0, | |
"orientation": "horizontal", | |
"style": "IPY_MODEL_e8d97ad24bb74bdf97bd2a21e17a2bb0", | |
"value": 20837 | |
} | |
}, | |
"ace914fcaf864e16bef11a5f97ce626c": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HTMLModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HTMLView", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_37085e6939734a5fa888b31f602b9b97", | |
"placeholder": "", | |
"style": "IPY_MODEL_40453e191280454eb70bce4b8250264c", | |
"value": " 20.8k/20.8k [00:00<00:00, 1.29MB/s]" | |
} | |
}, | |
"6b771547d7524857bbc0854bc62fa85f": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"663d82b5194149848cd22895d024ea7c": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"361f137127b34ea3986e42ba05926cb6": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "DescriptionStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"description_width": "" | |
} | |
}, | |
"760b75b6da344c1d83ea4362668a22bb": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"e8d97ad24bb74bdf97bd2a21e17a2bb0": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "ProgressStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"bar_color": null, | |
"description_width": "" | |
} | |
}, | |
"37085e6939734a5fa888b31f602b9b97": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"40453e191280454eb70bce4b8250264c": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "DescriptionStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"description_width": "" | |
} | |
}, | |
"5ea7b7bbb88c468e9ba06bd82c48c097": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HBoxModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HBoxView", | |
"box_style": "", | |
"children": [ | |
"IPY_MODEL_02210c97d2cb481884448fa21a72c230", | |
"IPY_MODEL_ae1417b2543d4ba7b22390464478d169", | |
"IPY_MODEL_b9ebb699dfec405ca772cc7ad114647c" | |
], | |
"layout": "IPY_MODEL_e3540f80ed9e4c01bad81112f15617d4" | |
} | |
}, | |
"02210c97d2cb481884448fa21a72c230": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HTMLModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HTMLView", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_40a913ac87ad4522ac5cd13c3f3643c8", | |
"placeholder": "", | |
"style": "IPY_MODEL_98f36fd893a642a3803c7ebf05753fd3", | |
"value": "tokenizer.json: 100%" | |
} | |
}, | |
"ae1417b2543d4ba7b22390464478d169": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "FloatProgressModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "ProgressView", | |
"bar_style": "success", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_8dca7f9d463c4ecdb0727f8e01724888", | |
"max": 2132967, | |
"min": 0, | |
"orientation": "horizontal", | |
"style": "IPY_MODEL_2351dc4374dd42ca950e003f6725bdb2", | |
"value": 2132967 | |
} | |
}, | |
"b9ebb699dfec405ca772cc7ad114647c": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HTMLModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HTMLView", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_7c4930fd56a547f38a4a3435a37a14b8", | |
"placeholder": "", | |
"style": "IPY_MODEL_c37f11f5e33141b49c9a0ab549ac048d", | |
"value": " 2.13M/2.13M [00:00<00:00, 25.1MB/s]" | |
} | |
}, | |
"e3540f80ed9e4c01bad81112f15617d4": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"40a913ac87ad4522ac5cd13c3f3643c8": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"98f36fd893a642a3803c7ebf05753fd3": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "DescriptionStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"description_width": "" | |
} | |
}, | |
"8dca7f9d463c4ecdb0727f8e01724888": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"2351dc4374dd42ca950e003f6725bdb2": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "ProgressStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"bar_color": null, | |
"description_width": "" | |
} | |
}, | |
"7c4930fd56a547f38a4a3435a37a14b8": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"c37f11f5e33141b49c9a0ab549ac048d": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "DescriptionStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"description_width": "" | |
} | |
}, | |
"dd4f732e14ee44fa96df3246d5da7753": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HBoxModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HBoxView", | |
"box_style": "", | |
"children": [ | |
"IPY_MODEL_e7262952b2274905b99d2535ebbb0ea9", | |
"IPY_MODEL_7c5105f8a2774b5a9bc42e5f2f392e5e", | |
"IPY_MODEL_9f96506d464e4dc080e907b2ba2c9a40" | |
], | |
"layout": "IPY_MODEL_9102425f1c624ab5bc3cdf7468a43c69" | |
} | |
}, | |
"e7262952b2274905b99d2535ebbb0ea9": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HTMLModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HTMLView", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_dcab4e74b4204273bea33b1560751b36", | |
"placeholder": "", | |
"style": "IPY_MODEL_f441f34ad9bc46b18786ff7761d8e593", | |
"value": "special_tokens_map.json: 100%" | |
} | |
}, | |
"7c5105f8a2774b5a9bc42e5f2f392e5e": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "FloatProgressModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "ProgressView", | |
"bar_style": "success", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_861578f5651646b887d4cb0a4f408cbd", | |
"max": 694, | |
"min": 0, | |
"orientation": "horizontal", | |
"style": "IPY_MODEL_913fade1dce94ce787c453b33870b343", | |
"value": 694 | |
} | |
}, | |
"9f96506d464e4dc080e907b2ba2c9a40": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "HTMLModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "HTMLView", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_7bdda64602ce4b62848b877ab243c25d", | |
"placeholder": "", | |
"style": "IPY_MODEL_ccc8f3103c444c98a236e2d385e5d758", | |
"value": " 694/694 [00:00<00:00, 34.5kB/s]" | |
} | |
}, | |
"9102425f1c624ab5bc3cdf7468a43c69": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"dcab4e74b4204273bea33b1560751b36": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"f441f34ad9bc46b18786ff7761d8e593": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "DescriptionStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"description_width": "" | |
} | |
}, | |
"861578f5651646b887d4cb0a4f408cbd": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"913fade1dce94ce787c453b33870b343": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "ProgressStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"bar_color": null, | |
"description_width": "" | |
} | |
}, | |
"7bdda64602ce4b62848b877ab243c25d": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": null | |
} | |
}, | |
"ccc8f3103c444c98a236e2d385e5d758": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "DescriptionStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"description_width": "" | |
} | |
} | |
} | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/nan-wang/430c2b86f07675304d6c401a3cddfe6a/inspect_into_modernbert_tokenizer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "iCAXSlNU9bPo", | |
"outputId": "a71f6cfa-607d-4dd1-f960-24225ab5a68a" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Collecting git+https://github.com/huggingface/transformers.git\n", | |
" Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-t53_uf7x\n", | |
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-t53_uf7x\n", | |
" Resolved https://github.com/huggingface/transformers.git to commit 59e5b3f01b7773439671c3a827348ba87dc8b92a\n", | |
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | |
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (3.16.1)\n", | |
"Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.27.0)\n", | |
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (1.26.4)\n", | |
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (24.2)\n", | |
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (6.0.2)\n", | |
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2024.11.6)\n", | |
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2.32.3)\n", | |
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.21.0)\n", | |
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.4.5)\n", | |
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (4.67.1)\n", | |
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (2024.10.0)\n", | |
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (4.12.2)\n", | |
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.4.0)\n", | |
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.10)\n", | |
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2.2.3)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2024.12.14)\n", | |
"Building wheels for collected packages: transformers\n", | |
" Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10331157 sha256=8f6cebb74c7ece1c7260b8ad5307cbaab5a6daea3db3b51ae83b80f2623b3d16\n", | |
" Stored in directory: /tmp/pip-ephem-wheel-cache-4o2183us/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16\n", | |
"Successfully built transformers\n", | |
"Installing collected packages: transformers\n", | |
" Attempting uninstall: transformers\n", | |
" Found existing installation: transformers 4.47.1\n", | |
" Uninstalling transformers-4.47.1:\n", | |
" Successfully uninstalled transformers-4.47.1\n", | |
"Successfully installed transformers-4.48.0.dev0\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install git+https://github.com/huggingface/transformers.git" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import re\n", | |
"from transformers import AutoTokenizer\n" | |
], | |
"metadata": { | |
"id": "WboGIDSrAiGW" | |
}, | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"patterns = [\n", | |
" # Single-line comments (must have content after the symbol)\n", | |
" re.compile(r'^\\s*(#\\S.*|//\\S.*|--\\S.*|;\\S.*)'), # Python, C/C++, Java, JavaScript, SQL, Matlab\n", | |
" # Multi-line comments (must have content inside the comment block)\n", | |
" re.compile(r'^\\s*(/\\*.*\\*/|\\'\\'\\'[^\\'\\r\\n]*\\'\\'\\'|\\\"\\\"\\\"[^\\\"\\\\r\\\\n]*\\\"\\\"\\\")'), # C/C++, Java, JavaScript, Go, Rust, Python\n", | |
"]\n", | |
"\n", | |
"def is_comment(line):\n", | |
" # Check each compiled pattern for whether it matches the line\n", | |
" for pattern in patterns:\n", | |
" if pattern.match(line):\n", | |
" return True\n", | |
" return False\n" | |
], | |
"metadata": { | |
"id": "xotzCAcL-ttS" | |
}, | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"voc_mbert = AutoTokenizer.from_pretrained(\"answerdotai/ModernBERT-large\").get_vocab()\n", | |
"\n", | |
"for k, index in voc_mbert.items():\n", | |
" if is_comment(k):\n", | |
" print(f\"{index}: {repr(k)}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1000, | |
"referenced_widgets": [ | |
"f8d50d3263e94aa48d49b6df2c51d7e3", | |
"bf53f11922ae42eb995fcb00170fb27f", | |
"c9d846ed39d4491cac3daef637e0a498", | |
"ace914fcaf864e16bef11a5f97ce626c", | |
"6b771547d7524857bbc0854bc62fa85f", | |
"663d82b5194149848cd22895d024ea7c", | |
"361f137127b34ea3986e42ba05926cb6", | |
"760b75b6da344c1d83ea4362668a22bb", | |
"e8d97ad24bb74bdf97bd2a21e17a2bb0", | |
"37085e6939734a5fa888b31f602b9b97", | |
"40453e191280454eb70bce4b8250264c", | |
"5ea7b7bbb88c468e9ba06bd82c48c097", | |
"02210c97d2cb481884448fa21a72c230", | |
"ae1417b2543d4ba7b22390464478d169", | |
"b9ebb699dfec405ca772cc7ad114647c", | |
"e3540f80ed9e4c01bad81112f15617d4", | |
"40a913ac87ad4522ac5cd13c3f3643c8", | |
"98f36fd893a642a3803c7ebf05753fd3", | |
"8dca7f9d463c4ecdb0727f8e01724888", | |
"2351dc4374dd42ca950e003f6725bdb2", | |
"7c4930fd56a547f38a4a3435a37a14b8", | |
"c37f11f5e33141b49c9a0ab549ac048d", | |
"dd4f732e14ee44fa96df3246d5da7753", | |
"e7262952b2274905b99d2535ebbb0ea9", | |
"7c5105f8a2774b5a9bc42e5f2f392e5e", | |
"9f96506d464e4dc080e907b2ba2c9a40", | |
"9102425f1c624ab5bc3cdf7468a43c69", | |
"dcab4e74b4204273bea33b1560751b36", | |
"f441f34ad9bc46b18786ff7761d8e593", | |
"861578f5651646b887d4cb0a4f408cbd", | |
"913fade1dce94ce787c453b33870b343", | |
"7bdda64602ce4b62848b877ab243c25d", | |
"ccc8f3103c444c98a236e2d385e5d758" | |
] | |
}, | |
"id": "1D2vvr1aAMRa", | |
"outputId": "efa6356f-599b-4019-efd2-1e672dbca904" | |
}, | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"tokenizer_config.json: 0%| | 0.00/20.8k [00:00<?, ?B/s]" | |
], | |
"application/vnd.jupyter.widget-view+json": { | |
"version_major": 2, | |
"version_minor": 0, | |
"model_id": "f8d50d3263e94aa48d49b6df2c51d7e3" | |
} | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"tokenizer.json: 0%| | 0.00/2.13M [00:00<?, ?B/s]" | |
], | |
"application/vnd.jupyter.widget-view+json": { | |
"version_major": 2, | |
"version_minor": 0, | |
"model_id": "5ea7b7bbb88c468e9ba06bd82c48c097" | |
} | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"special_tokens_map.json: 0%| | 0.00/694 [00:00<?, ?B/s]" | |
], | |
"application/vnd.jupyter.widget-view+json": { | |
"version_major": 2, | |
"version_minor": 0, | |
"model_id": "dd4f732e14ee44fa96df3246d5da7753" | |
} | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"30863: ';\"><'\n", | |
"39485: '-------------------------------------------'\n", | |
"44136: '-------------------------------------------------'\n", | |
"12723: ';;'\n", | |
"29295: '----------------------------'\n", | |
"5146: '######'\n", | |
"19628: '--------------------------------------------------------------------------------'\n", | |
"45263: '--------------------------------------------------'\n", | |
"15623: '---|'\n", | |
"48924: '//----------------------------------------------------------------'\n", | |
"397: '--------'\n", | |
"2518: '----------------------------------------------------------------'\n", | |
"22902: '################################'\n", | |
"28693: '------------------------------'\n", | |
"9794: '---------'\n", | |
"34194: ';&#'\n", | |
"20744: ';\\\\;\\\\'\n", | |
"22873: '---------------------'\n", | |
"45599: ';|'\n", | |
"26577: '#.'\n", | |
"10326: '#:'\n", | |
"1835: '####'\n", | |
"22158: '-->'\n", | |
"29234: ';_'\n", | |
"29648: '-----------------------------'\n", | |
"26077: ';<'\n", | |
"1013: '--------------------------------'\n", | |
"32657: ';;;;'\n", | |
"28505: '------------------------------------------------------------------------------------------------'\n", | |
"39423: '----------------------------------------------------------------------------------------------------------------'\n", | |
"22928: '------------------'\n", | |
"24702: '-------------------------'\n", | |
"42277: '--;'\n", | |
"11311: '---------------'\n", | |
"16352: ';\"'\n", | |
"42451: '---------------------------------------------'\n", | |
"35349: '--------------------------------------'\n", | |
"10521: '--------------'\n", | |
"9962: '----------'\n", | |
"43657: ';{\\\\'\n", | |
"2917: '////'\n", | |
"37041: '------------------------------------------------------------------------'\n", | |
"48151: '-------------------------------------------------------'\n", | |
"39481: '//!'\n", | |
"22866: ';}'\n", | |
"817: '##'\n", | |
"47332: '----------------------------------------------------'\n", | |
"44391: '#{$'\n", | |
"16525: '----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'\n", | |
"4118: '###'\n", | |
"47726: \"#'\"\n", | |
"30624: ';/'\n", | |
"315: '----'\n", | |
"40904: '--------------------------------------------'\n", | |
"36311: '---------------------------------------'\n", | |
"2912: '------------'\n", | |
"26043: '------------------------'\n", | |
"7405: ';\\\\'\n", | |
"35000: '-------------------------------------'\n", | |
"26836: '--\"'\n", | |
"23380: '-----------------'\n", | |
"23796: '-------------------'\n", | |
"47584: '---------------------------------------------------'\n", | |
"30282: '---------------------------------'\n", | |
"6675: '########'\n", | |
"9998: '-----------'\n", | |
"9032: '////////////////'\n", | |
"43067: '----------------------------------------------------------------------'\n", | |
"50001: '--['\n", | |
"33585: '-----------------------------------'\n", | |
"38944: '-----------------------------------------'\n", | |
"13011: ';\">'\n", | |
"1532: '---'\n", | |
"6154: '------------------------------------------------'\n", | |
"34638: '------------------------------------'\n", | |
"6846: '-------'\n", | |
"10428: '-------------'\n", | |
"7040: '-----'\n", | |
"573: '----------------'\n", | |
"13309: ';&'\n", | |
"22002: '#####'\n", | |
"16985: '////////////////////////////////'\n", | |
"5071: '////////'\n", | |
"7078: '--------------------------------------------------------------------------------------------------------------------------------'\n", | |
"25916: '-----------------------'\n", | |
"48904: '------------------------------------------------------'\n", | |
"36739: ';,'\n", | |
"28388: '---------------------------'\n", | |
"47632: '---|---|---'\n", | |
"39421: '------------------------------------------'\n", | |
"4485: '------'\n", | |
"36960: '----------------------------------------'\n", | |
"43500: '-----------------------------------------------'\n", | |
"13143: ';</'\n", | |
"42040: '----------------------------------------------'\n", | |
"28511: ';\\\\;'\n", | |
"27396: '#,'\n", | |
"15879: '--------------------'\n", | |
"33250: '----------------------------------'\n", | |
"27800: '--------------------------'\n", | |
"33301: '--**'\n", | |
"32107: '-------------------------------'\n", | |
"23130: '----------------------'\n", | |
"11890: '################'\n", | |
"37446: '////////////////////////////////////////////////////////////////'\n", | |
"10638: '///'\n", | |
"20782: '---|---'\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"voc_roberta = AutoTokenizer.from_pretrained('roberta-large').get_vocab()\n", | |
"\n", | |
"for k, index in voc_roberta.items():\n", | |
" if is_comment(k):\n", | |
" print(f\"{index}: {repr(k)}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "u-EMCfnaAQtx", | |
"outputId": "2f200689-9827-4170-9d41-a7abf6b1d170" | |
}, | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"49216: '////////////////////////////////'\n", | |
"31175: '---------------'\n", | |
"46580: '------------------------'\n", | |
"49308: '----------------------------------------------------------------'\n", | |
"49104: '///'\n", | |
"49198: '--------'\n", | |
"48803: ';}'\n", | |
"45072: '----------'\n", | |
"49625: '-->'\n", | |
"50012: ';;;;'\n", | |
"49283: '////////'\n", | |
"49296: '////////////////'\n", | |
"46156: '------------'\n", | |
"44516: '------'\n", | |
"49599: '--------------------------------------------------------'\n", | |
"49909: ';;;;;;;;'\n", | |
"47655: '--------------------'\n", | |
"49629: '####'\n", | |
"49183: '-----------'\n", | |
"48342: '##'\n", | |
"48712: '-------'\n", | |
"44259: '----'\n", | |
"39550: '-------------'\n", | |
"47826: '---------'\n", | |
"46939: '--------------------------------'\n", | |
"41110: '--------------'\n", | |
"48134: '###'\n", | |
"50072: ';;;;;;;;;;;;'\n", | |
"49255: '#$'\n", | |
"49674: '################################'\n", | |
"49727: '########'\n", | |
"46343: '-----'\n", | |
"48900: '////'\n", | |
"42777: ';\"'\n", | |
"49806: '################'\n", | |
"48640: ';;'\n", | |
"50065: '#$#$'\n", | |
"24965: '---'\n", | |
"24524: '----------------'\n", | |
"49374: '------------------------------------------------'\n", | |
"49903: '--+'\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment