Created
January 7, 2025 04:32
-
-
Save nan-wang/6a7ed3881c8941e82f93153278e46225 to your computer and use it in GitHub Desktop.
inspect_into_modernbert_tokenizer.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyPdSUmlYiedNetqIv6cvAXY", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/nan-wang/6a7ed3881c8941e82f93153278e46225/inspect_into_modernbert_tokenizer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "iCAXSlNU9bPo", | |
"outputId": "1a930d88-794c-46f5-83f9-111107921e9a" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Collecting git+https://github.com/huggingface/transformers.git\n", | |
" Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-1r4_1umv\n", | |
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-1r4_1umv\n", | |
" Resolved https://github.com/huggingface/transformers.git to commit 12ba96aa3cb3e4ed2a3ffb77b59f53f8ce9ac1fa\n", | |
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | |
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (3.16.1)\n", | |
"Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.27.0)\n", | |
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (1.26.4)\n", | |
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (24.2)\n", | |
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (6.0.2)\n", | |
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2024.11.6)\n", | |
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2.32.3)\n", | |
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.21.0)\n", | |
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.4.5)\n", | |
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (4.67.1)\n", | |
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (2024.10.0)\n", | |
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (4.12.2)\n", | |
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.4.0)\n", | |
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.10)\n", | |
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2.2.3)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2024.12.14)\n", | |
"Building wheels for collected packages: transformers\n", | |
" Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10291798 sha256=45952e471b259c2bc576e1c85d7bcffe065daea1b9586868e0bd1d6fd000b533\n", | |
" Stored in directory: /tmp/pip-ephem-wheel-cache-x7ro2qtm/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16\n", | |
"Successfully built transformers\n", | |
"Installing collected packages: transformers\n", | |
" Attempting uninstall: transformers\n", | |
" Found existing installation: transformers 4.47.1\n", | |
" Uninstalling transformers-4.47.1:\n", | |
" Successfully uninstalled transformers-4.47.1\n", | |
"Successfully installed transformers-4.48.0.dev0\n" | |
] | |
} | |
], | |
"source": [ | |
"# Pinned to the commit this notebook was originally run against (transformers 4.48.0.dev0).\n", | |
"%pip install git+https://github.com/huggingface/transformers.git@12ba96aa3cb3e4ed2a3ffb77b59f53f8ce9ac1fa" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import re\n", | |
"from transformers import AutoTokenizer\n" | |
], | |
"metadata": { | |
"id": "WboGIDSrAiGW" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Compiled regexes, each anchored at the start of the string (leading whitespace is allowed).\n", | |
"patterns = [\n", | |
"    # Single-line comments: the marker must be followed immediately by a non-whitespace character\n", | |
"    re.compile(r'^\\s*(#\\S.*|//\\S.*|--\\S.*|;\\S.*)'),  # Python, C/C++, Java, JavaScript, SQL, Matlab\n", | |
"    # Multi-line comments that open and close on the same line (quoted forms exclude CR/LF and the quote char)\n", | |
"    re.compile(r'^\\s*(/\\*.*\\*/|\\'\\'\\'[^\\'\\r\\n]*\\'\\'\\'|\\\"\\\"\\\"[^\\\"\\r\\n]*\\\"\\\"\\\")'),  # C/C++, Java, JavaScript, Go, Rust, Python\n", | |
"]\n", | |
"\n", | |
"\n", | |
"def is_comment(line):\n", | |
"    \"\"\"Return True if `line` matches any regex in `patterns`, otherwise False.\"\"\"\n", | |
"    return any(pattern.match(line) for pattern in patterns)\n" | |
], | |
"metadata": { | |
"id": "xotzCAcL-ttS" | |
}, | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Vocabulary of ModernBERT-large; iterated below as (token string, token id) pairs.\n", | |
"voc_mbert = AutoTokenizer.from_pretrained(\"answerdotai/ModernBERT-large\").get_vocab()\n", | |
"\n", | |
"# Print the id and repr of every vocabulary entry that is_comment() flags.\n", | |
"for k, index in voc_mbert.items():\n", | |
"    if not is_comment(k):\n", | |
"        continue\n", | |
"    print(f\"{index}: {repr(k)}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "1D2vvr1aAMRa", | |
"outputId": "b58a608b-5a00-47c6-c96d-916d6471aa50" | |
}, | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"2912: '------------'\n", | |
"23796: '-------------------'\n", | |
"23380: '-----------------'\n", | |
"42277: '--;'\n", | |
"9032: '////////////////'\n", | |
"15623: '---|'\n", | |
"39485: '-------------------------------------------'\n", | |
"30624: ';/'\n", | |
"28693: '------------------------------'\n", | |
"39481: '//!'\n", | |
"42040: '----------------------------------------------'\n", | |
"6154: '------------------------------------------------'\n", | |
"43500: '-----------------------------------------------'\n", | |
"10521: '--------------'\n", | |
"39423: '----------------------------------------------------------------------------------------------------------------'\n", | |
"573: '----------------'\n", | |
"1532: '---'\n", | |
"32107: '-------------------------------'\n", | |
"26043: '------------------------'\n", | |
"22873: '---------------------'\n", | |
"16985: '////////////////////////////////'\n", | |
"13309: ';&'\n", | |
"37446: '////////////////////////////////////////////////////////////////'\n", | |
"36739: ';,'\n", | |
"29648: '-----------------------------'\n", | |
"4485: '------'\n", | |
"44136: '-------------------------------------------------'\n", | |
"28511: ';\\\\;'\n", | |
"35000: '-------------------------------------'\n", | |
"48151: '-------------------------------------------------------'\n", | |
"397: '--------'\n", | |
"45599: ';|'\n", | |
"10326: '#:'\n", | |
"6846: '-------'\n", | |
"38944: '-----------------------------------------'\n", | |
"5146: '######'\n", | |
"9962: '----------'\n", | |
"44391: '#{$'\n", | |
"33585: '-----------------------------------'\n", | |
"20782: '---|---'\n", | |
"22928: '------------------'\n", | |
"34638: '------------------------------------'\n", | |
"26836: '--\"'\n", | |
"27396: '#,'\n", | |
"26577: '#.'\n", | |
"39421: '------------------------------------------'\n", | |
"2917: '////'\n", | |
"36311: '---------------------------------------'\n", | |
"47726: \"#'\"\n", | |
"25916: '-----------------------'\n", | |
"16352: ';\"'\n", | |
"36960: '----------------------------------------'\n", | |
"42451: '---------------------------------------------'\n", | |
"13143: ';</'\n", | |
"37041: '------------------------------------------------------------------------'\n", | |
"27800: '--------------------------'\n", | |
"35349: '--------------------------------------'\n", | |
"47584: '---------------------------------------------------'\n", | |
"7405: ';\\\\'\n", | |
"45263: '--------------------------------------------------'\n", | |
"47332: '----------------------------------------------------'\n", | |
"48904: '------------------------------------------------------'\n", | |
"6675: '########'\n", | |
"30863: ';\"><'\n", | |
"23130: '----------------------'\n", | |
"22158: '-->'\n", | |
"1835: '####'\n", | |
"29295: '----------------------------'\n", | |
"12723: ';;'\n", | |
"28505: '------------------------------------------------------------------------------------------------'\n", | |
"22002: '#####'\n", | |
"43657: ';{\\\\'\n", | |
"29234: ';_'\n", | |
"32657: ';;;;'\n", | |
"16525: '----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'\n", | |
"10638: '///'\n", | |
"13011: ';\">'\n", | |
"817: '##'\n", | |
"22902: '################################'\n", | |
"7078: '--------------------------------------------------------------------------------------------------------------------------------'\n", | |
"50001: '--['\n", | |
"9794: '---------'\n", | |
"47632: '---|---|---'\n", | |
"48924: '//----------------------------------------------------------------'\n", | |
"26077: ';<'\n", | |
"11890: '################'\n", | |
"19628: '--------------------------------------------------------------------------------'\n", | |
"33301: '--**'\n", | |
"7040: '-----'\n", | |
"5071: '////////'\n", | |
"33250: '----------------------------------'\n", | |
"43067: '----------------------------------------------------------------------'\n", | |
"10428: '-------------'\n", | |
"28388: '---------------------------'\n", | |
"1013: '--------------------------------'\n", | |
"4118: '###'\n", | |
"9998: '-----------'\n", | |
"11311: '---------------'\n", | |
"34194: ';&#'\n", | |
"2518: '----------------------------------------------------------------'\n", | |
"15879: '--------------------'\n", | |
"22866: ';}'\n", | |
"40904: '--------------------------------------------'\n", | |
"30282: '---------------------------------'\n", | |
"315: '----'\n", | |
"24702: '-------------------------'\n", | |
"20744: ';\\\\;\\\\'\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Vocabulary of jina-embeddings-v3; iterated below as (token string, token id) pairs.\n", | |
"voc_je_v3 = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v3').get_vocab()\n", | |
"\n", | |
"# Print the id and repr of every vocabulary entry that is_comment() flags.\n", | |
"for k, index in voc_je_v3.items():\n", | |
"    if not is_comment(k):\n", | |
"        continue\n", | |
"    print(f\"{index}: {repr(k)}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "u-EMCfnaAQtx", | |
"outputId": "1afcfdec-cba0-46a2-cec4-8a6c1e3bb8fb" | |
}, | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"187284: '###'\n", | |
"157724: ';;;'\n", | |
"55479: '---'\n", | |
"106115: '----'\n", | |
"152745: '-->'\n", | |
"223009: '////'\n", | |
"110405: '------'\n", | |
"103428: '----------------'\n", | |
"195626: '----------'\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment