Skip to content

Instantly share code, notes, and snippets.

@nan-wang
Created January 8, 2025 14:22
Show Gist options
  • Save nan-wang/430c2b86f07675304d6c401a3cddfe6a to your computer and use it in GitHub Desktop.
Save nan-wang/430c2b86f07675304d6c401a3cddfe6a to your computer and use it in GitHub Desktop.
inspect_into_modernbert_tokenizer.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyM32Pf1T/gYmX06lwSM61Wl",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"f8d50d3263e94aa48d49b6df2c51d7e3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_bf53f11922ae42eb995fcb00170fb27f",
"IPY_MODEL_c9d846ed39d4491cac3daef637e0a498",
"IPY_MODEL_ace914fcaf864e16bef11a5f97ce626c"
],
"layout": "IPY_MODEL_6b771547d7524857bbc0854bc62fa85f"
}
},
"bf53f11922ae42eb995fcb00170fb27f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_663d82b5194149848cd22895d024ea7c",
"placeholder": "​",
"style": "IPY_MODEL_361f137127b34ea3986e42ba05926cb6",
"value": "tokenizer_config.json: 100%"
}
},
"c9d846ed39d4491cac3daef637e0a498": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_760b75b6da344c1d83ea4362668a22bb",
"max": 20837,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_e8d97ad24bb74bdf97bd2a21e17a2bb0",
"value": 20837
}
},
"ace914fcaf864e16bef11a5f97ce626c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_37085e6939734a5fa888b31f602b9b97",
"placeholder": "​",
"style": "IPY_MODEL_40453e191280454eb70bce4b8250264c",
"value": " 20.8k/20.8k [00:00<00:00, 1.29MB/s]"
}
},
"6b771547d7524857bbc0854bc62fa85f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"663d82b5194149848cd22895d024ea7c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"361f137127b34ea3986e42ba05926cb6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"760b75b6da344c1d83ea4362668a22bb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"e8d97ad24bb74bdf97bd2a21e17a2bb0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"37085e6939734a5fa888b31f602b9b97": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"40453e191280454eb70bce4b8250264c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"5ea7b7bbb88c468e9ba06bd82c48c097": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_02210c97d2cb481884448fa21a72c230",
"IPY_MODEL_ae1417b2543d4ba7b22390464478d169",
"IPY_MODEL_b9ebb699dfec405ca772cc7ad114647c"
],
"layout": "IPY_MODEL_e3540f80ed9e4c01bad81112f15617d4"
}
},
"02210c97d2cb481884448fa21a72c230": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_40a913ac87ad4522ac5cd13c3f3643c8",
"placeholder": "​",
"style": "IPY_MODEL_98f36fd893a642a3803c7ebf05753fd3",
"value": "tokenizer.json: 100%"
}
},
"ae1417b2543d4ba7b22390464478d169": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_8dca7f9d463c4ecdb0727f8e01724888",
"max": 2132967,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_2351dc4374dd42ca950e003f6725bdb2",
"value": 2132967
}
},
"b9ebb699dfec405ca772cc7ad114647c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_7c4930fd56a547f38a4a3435a37a14b8",
"placeholder": "​",
"style": "IPY_MODEL_c37f11f5e33141b49c9a0ab549ac048d",
"value": " 2.13M/2.13M [00:00<00:00, 25.1MB/s]"
}
},
"e3540f80ed9e4c01bad81112f15617d4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"40a913ac87ad4522ac5cd13c3f3643c8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"98f36fd893a642a3803c7ebf05753fd3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"8dca7f9d463c4ecdb0727f8e01724888": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"2351dc4374dd42ca950e003f6725bdb2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"7c4930fd56a547f38a4a3435a37a14b8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c37f11f5e33141b49c9a0ab549ac048d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"dd4f732e14ee44fa96df3246d5da7753": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_e7262952b2274905b99d2535ebbb0ea9",
"IPY_MODEL_7c5105f8a2774b5a9bc42e5f2f392e5e",
"IPY_MODEL_9f96506d464e4dc080e907b2ba2c9a40"
],
"layout": "IPY_MODEL_9102425f1c624ab5bc3cdf7468a43c69"
}
},
"e7262952b2274905b99d2535ebbb0ea9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_dcab4e74b4204273bea33b1560751b36",
"placeholder": "​",
"style": "IPY_MODEL_f441f34ad9bc46b18786ff7761d8e593",
"value": "special_tokens_map.json: 100%"
}
},
"7c5105f8a2774b5a9bc42e5f2f392e5e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_861578f5651646b887d4cb0a4f408cbd",
"max": 694,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_913fade1dce94ce787c453b33870b343",
"value": 694
}
},
"9f96506d464e4dc080e907b2ba2c9a40": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_7bdda64602ce4b62848b877ab243c25d",
"placeholder": "​",
"style": "IPY_MODEL_ccc8f3103c444c98a236e2d385e5d758",
"value": " 694/694 [00:00<00:00, 34.5kB/s]"
}
},
"9102425f1c624ab5bc3cdf7468a43c69": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"dcab4e74b4204273bea33b1560751b36": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f441f34ad9bc46b18786ff7761d8e593": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"861578f5651646b887d4cb0a4f408cbd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"913fade1dce94ce787c453b33870b343": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"7bdda64602ce4b62848b877ab243c25d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ccc8f3103c444c98a236e2d385e5d758": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/nan-wang/430c2b86f07675304d6c401a3cddfe6a/inspect_into_modernbert_tokenizer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iCAXSlNU9bPo",
"outputId": "a71f6cfa-607d-4dd1-f960-24225ab5a68a"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting git+https://github.com/huggingface/transformers.git\n",
" Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-t53_uf7x\n",
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-t53_uf7x\n",
" Resolved https://github.com/huggingface/transformers.git to commit 59e5b3f01b7773439671c3a827348ba87dc8b92a\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (3.16.1)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.27.0)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (1.26.4)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (6.0.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2024.11.6)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2.32.3)\n",
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.21.0)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.4.5)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (4.67.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (2024.10.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (4.12.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.4.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2.2.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2024.12.14)\n",
"Building wheels for collected packages: transformers\n",
" Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10331157 sha256=8f6cebb74c7ece1c7260b8ad5307cbaab5a6daea3db3b51ae83b80f2623b3d16\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-4o2183us/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16\n",
"Successfully built transformers\n",
"Installing collected packages: transformers\n",
" Attempting uninstall: transformers\n",
" Found existing installation: transformers 4.47.1\n",
" Uninstalling transformers-4.47.1:\n",
" Successfully uninstalled transformers-4.47.1\n",
"Successfully installed transformers-4.48.0.dev0\n"
]
}
],
"source": [
"!pip install git+https://github.com/huggingface/transformers.git"
]
},
{
"cell_type": "code",
"source": [
"import re\n",
"from transformers import AutoTokenizer\n"
],
"metadata": {
"id": "WboGIDSrAiGW"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"patterns = [\n",
" # Single-line comments (must have content after the symbol)\n",
" re.compile(r'^\\s*(#\\S.*|//\\S.*|--\\S.*|;\\S.*)'), # Python, C/C++, Java, JavaScript, SQL, Matlab\n",
" # Multi-line comments (must have content inside the comment block)\n",
" re.compile(r'^\\s*(/\\*.*\\*/|\\'\\'\\'[^\\'\\r\\n]*\\'\\'\\'|\\\"\\\"\\\"[^\\\"\\\\r\\\\n]*\\\"\\\"\\\")'), # C/C++, Java, JavaScript, Go, Rust, Python\n",
"]\n",
"\n",
"def is_comment(line):\n",
" # Check each compiled pattern for whether it matches the line\n",
" for pattern in patterns:\n",
" if pattern.match(line):\n",
" return True\n",
" return False\n"
],
"metadata": {
"id": "xotzCAcL-ttS"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"voc_mbert = AutoTokenizer.from_pretrained(\"answerdotai/ModernBERT-large\").get_vocab()\n",
"\n",
"for k, index in voc_mbert.items():\n",
" if is_comment(k):\n",
" print(f\"{index}: {repr(k)}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"f8d50d3263e94aa48d49b6df2c51d7e3",
"bf53f11922ae42eb995fcb00170fb27f",
"c9d846ed39d4491cac3daef637e0a498",
"ace914fcaf864e16bef11a5f97ce626c",
"6b771547d7524857bbc0854bc62fa85f",
"663d82b5194149848cd22895d024ea7c",
"361f137127b34ea3986e42ba05926cb6",
"760b75b6da344c1d83ea4362668a22bb",
"e8d97ad24bb74bdf97bd2a21e17a2bb0",
"37085e6939734a5fa888b31f602b9b97",
"40453e191280454eb70bce4b8250264c",
"5ea7b7bbb88c468e9ba06bd82c48c097",
"02210c97d2cb481884448fa21a72c230",
"ae1417b2543d4ba7b22390464478d169",
"b9ebb699dfec405ca772cc7ad114647c",
"e3540f80ed9e4c01bad81112f15617d4",
"40a913ac87ad4522ac5cd13c3f3643c8",
"98f36fd893a642a3803c7ebf05753fd3",
"8dca7f9d463c4ecdb0727f8e01724888",
"2351dc4374dd42ca950e003f6725bdb2",
"7c4930fd56a547f38a4a3435a37a14b8",
"c37f11f5e33141b49c9a0ab549ac048d",
"dd4f732e14ee44fa96df3246d5da7753",
"e7262952b2274905b99d2535ebbb0ea9",
"7c5105f8a2774b5a9bc42e5f2f392e5e",
"9f96506d464e4dc080e907b2ba2c9a40",
"9102425f1c624ab5bc3cdf7468a43c69",
"dcab4e74b4204273bea33b1560751b36",
"f441f34ad9bc46b18786ff7761d8e593",
"861578f5651646b887d4cb0a4f408cbd",
"913fade1dce94ce787c453b33870b343",
"7bdda64602ce4b62848b877ab243c25d",
"ccc8f3103c444c98a236e2d385e5d758"
]
},
"id": "1D2vvr1aAMRa",
"outputId": "efa6356f-599b-4019-efd2-1e672dbca904"
},
"execution_count": 7,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/20.8k [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "f8d50d3263e94aa48d49b6df2c51d7e3"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer.json: 0%| | 0.00/2.13M [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "5ea7b7bbb88c468e9ba06bd82c48c097"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/694 [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "dd4f732e14ee44fa96df3246d5da7753"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"30863: ';\"><'\n",
"39485: '-------------------------------------------'\n",
"44136: '-------------------------------------------------'\n",
"12723: ';;'\n",
"29295: '----------------------------'\n",
"5146: '######'\n",
"19628: '--------------------------------------------------------------------------------'\n",
"45263: '--------------------------------------------------'\n",
"15623: '---|'\n",
"48924: '//----------------------------------------------------------------'\n",
"397: '--------'\n",
"2518: '----------------------------------------------------------------'\n",
"22902: '################################'\n",
"28693: '------------------------------'\n",
"9794: '---------'\n",
"34194: ';&#'\n",
"20744: ';\\\\;\\\\'\n",
"22873: '---------------------'\n",
"45599: ';|'\n",
"26577: '#.'\n",
"10326: '#:'\n",
"1835: '####'\n",
"22158: '-->'\n",
"29234: ';_'\n",
"29648: '-----------------------------'\n",
"26077: ';<'\n",
"1013: '--------------------------------'\n",
"32657: ';;;;'\n",
"28505: '------------------------------------------------------------------------------------------------'\n",
"39423: '----------------------------------------------------------------------------------------------------------------'\n",
"22928: '------------------'\n",
"24702: '-------------------------'\n",
"42277: '--;'\n",
"11311: '---------------'\n",
"16352: ';\"'\n",
"42451: '---------------------------------------------'\n",
"35349: '--------------------------------------'\n",
"10521: '--------------'\n",
"9962: '----------'\n",
"43657: ';{\\\\'\n",
"2917: '////'\n",
"37041: '------------------------------------------------------------------------'\n",
"48151: '-------------------------------------------------------'\n",
"39481: '//!'\n",
"22866: ';}'\n",
"817: '##'\n",
"47332: '----------------------------------------------------'\n",
"44391: '#{$'\n",
"16525: '----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'\n",
"4118: '###'\n",
"47726: \"#'\"\n",
"30624: ';/'\n",
"315: '----'\n",
"40904: '--------------------------------------------'\n",
"36311: '---------------------------------------'\n",
"2912: '------------'\n",
"26043: '------------------------'\n",
"7405: ';\\\\'\n",
"35000: '-------------------------------------'\n",
"26836: '--\"'\n",
"23380: '-----------------'\n",
"23796: '-------------------'\n",
"47584: '---------------------------------------------------'\n",
"30282: '---------------------------------'\n",
"6675: '########'\n",
"9998: '-----------'\n",
"9032: '////////////////'\n",
"43067: '----------------------------------------------------------------------'\n",
"50001: '--['\n",
"33585: '-----------------------------------'\n",
"38944: '-----------------------------------------'\n",
"13011: ';\">'\n",
"1532: '---'\n",
"6154: '------------------------------------------------'\n",
"34638: '------------------------------------'\n",
"6846: '-------'\n",
"10428: '-------------'\n",
"7040: '-----'\n",
"573: '----------------'\n",
"13309: ';&'\n",
"22002: '#####'\n",
"16985: '////////////////////////////////'\n",
"5071: '////////'\n",
"7078: '--------------------------------------------------------------------------------------------------------------------------------'\n",
"25916: '-----------------------'\n",
"48904: '------------------------------------------------------'\n",
"36739: ';,'\n",
"28388: '---------------------------'\n",
"47632: '---|---|---'\n",
"39421: '------------------------------------------'\n",
"4485: '------'\n",
"36960: '----------------------------------------'\n",
"43500: '-----------------------------------------------'\n",
"13143: ';</'\n",
"42040: '----------------------------------------------'\n",
"28511: ';\\\\;'\n",
"27396: '#,'\n",
"15879: '--------------------'\n",
"33250: '----------------------------------'\n",
"27800: '--------------------------'\n",
"33301: '--**'\n",
"32107: '-------------------------------'\n",
"23130: '----------------------'\n",
"11890: '################'\n",
"37446: '////////////////////////////////////////////////////////////////'\n",
"10638: '///'\n",
"20782: '---|---'\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"voc_roberta = AutoTokenizer.from_pretrained('roberta-large').get_vocab()\n",
"\n",
"for k, index in voc_roberta.items():\n",
"    if is_comment(k):\n",
"        print(f\"{index}: {repr(k)}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "u-EMCfnaAQtx",
"outputId": "2f200689-9827-4170-9d41-a7abf6b1d170"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"49216: '////////////////////////////////'\n",
"31175: '---------------'\n",
"46580: '------------------------'\n",
"49308: '----------------------------------------------------------------'\n",
"49104: '///'\n",
"49198: '--------'\n",
"48803: ';}'\n",
"45072: '----------'\n",
"49625: '-->'\n",
"50012: ';;;;'\n",
"49283: '////////'\n",
"49296: '////////////////'\n",
"46156: '------------'\n",
"44516: '------'\n",
"49599: '--------------------------------------------------------'\n",
"49909: ';;;;;;;;'\n",
"47655: '--------------------'\n",
"49629: '####'\n",
"49183: '-----------'\n",
"48342: '##'\n",
"48712: '-------'\n",
"44259: '----'\n",
"39550: '-------------'\n",
"47826: '---------'\n",
"46939: '--------------------------------'\n",
"41110: '--------------'\n",
"48134: '###'\n",
"50072: ';;;;;;;;;;;;'\n",
"49255: '#$'\n",
"49674: '################################'\n",
"49727: '########'\n",
"46343: '-----'\n",
"48900: '////'\n",
"42777: ';\"'\n",
"49806: '################'\n",
"48640: ';;'\n",
"50065: '#$#$'\n",
"24965: '---'\n",
"24524: '----------------'\n",
"49374: '------------------------------------------------'\n",
"49903: '--+'\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment