Skip to content

Instantly share code, notes, and snippets.

@nan-wang
Created January 7, 2025 04:32
Show Gist options
  • Save nan-wang/6a7ed3881c8941e82f93153278e46225 to your computer and use it in GitHub Desktop.
Save nan-wang/6a7ed3881c8941e82f93153278e46225 to your computer and use it in GitHub Desktop.
inspect_into_modernbert_tokenizer.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPdSUmlYiedNetqIv6cvAXY",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/nan-wang/6a7ed3881c8941e82f93153278e46225/inspect_into_modernbert_tokenizer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iCAXSlNU9bPo",
"outputId": "1a930d88-794c-46f5-83f9-111107921e9a"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting git+https://github.com/huggingface/transformers.git\n",
" Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-1r4_1umv\n",
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-1r4_1umv\n",
" Resolved https://github.com/huggingface/transformers.git to commit 12ba96aa3cb3e4ed2a3ffb77b59f53f8ce9ac1fa\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (3.16.1)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.27.0)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (1.26.4)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (6.0.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2024.11.6)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2.32.3)\n",
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.21.0)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.4.5)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (4.67.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (2024.10.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (4.12.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.4.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2.2.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2024.12.14)\n",
"Building wheels for collected packages: transformers\n",
" Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10291798 sha256=45952e471b259c2bc576e1c85d7bcffe065daea1b9586868e0bd1d6fd000b533\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-x7ro2qtm/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16\n",
"Successfully built transformers\n",
"Installing collected packages: transformers\n",
" Attempting uninstall: transformers\n",
" Found existing installation: transformers 4.47.1\n",
" Uninstalling transformers-4.47.1:\n",
" Successfully uninstalled transformers-4.47.1\n",
"Successfully installed transformers-4.48.0.dev0\n"
]
}
],
"source": [
"# Pinned to the commit this notebook was originally run against\n",
"# (built as transformers 4.48.0.dev0) so a fresh run installs the same code.\n",
"%pip install -q git+https://github.com/huggingface/transformers.git@12ba96aa3cb3e4ed2a3ffb77b59f53f8ce9ac1fa"
]
},
{
"cell_type": "code",
"source": [
"import re\n",
"from transformers import AutoTokenizer\n"
],
"metadata": {
"id": "WboGIDSrAiGW"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Regexes used by is_comment(). Each one is anchored at the start of the string\n",
"# and allows leading whitespace before the comment marker.\n",
"patterns = [\n",
"    # Single-line comments: a marker (#, //, -- or ;) followed immediately by a\n",
"    # non-whitespace character. Runs of marker characters such as '----' or\n",
"    # '####' also match, because the following marker counts as non-whitespace.\n",
"    re.compile(r'^\\s*(#\\S.*|//\\S.*|--\\S.*|;\\S.*)'),  # Python, C/C++, Java, JavaScript, SQL, Matlab\n",
"    # Block comments opened and closed on one line: /* ... */, ''' ... ''' and\n",
"    # \"\"\" ... \"\"\". The quoted forms may not contain their own quote character,\n",
"    # a carriage return or a line feed.\n",
"    re.compile(r'^\\s*(/\\*.*\\*/|\\'\\'\\'[^\\'\\r\\n]*\\'\\'\\'|\\\"\\\"\\\"[^\\\"\\r\\n]*\\\"\\\"\\\")'),  # C/C++, Java, JavaScript, Go, Rust, Python\n",
"]\n",
"\n",
"\n",
"def is_comment(line: str) -> bool:\n",
"    \"\"\"Report whether ``line`` matches one of the comment regexes in ``patterns``.\n",
"\n",
"    Args:\n",
"        line: The string to test.\n",
"\n",
"    Returns:\n",
"        True if any regex in ``patterns`` matches at the start of ``line``\n",
"        (leading whitespace allowed), otherwise False.\n",
"    \"\"\"\n",
"    return any(pattern.match(line) for pattern in patterns)\n"
],
"metadata": {
"id": "xotzCAcL-ttS"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Load ModernBERT-large's vocabulary and print every entry that is_comment()\n",
"# flags, one per line, as '<id>: <repr of token>'.\n",
"voc_mbert = AutoTokenizer.from_pretrained(\"answerdotai/ModernBERT-large\").get_vocab()\n",
"\n",
"for k, index in voc_mbert.items():\n",
"    if not is_comment(k):\n",
"        continue  # not comment-like; nothing to report\n",
"    print(f\"{index}: {repr(k)}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1D2vvr1aAMRa",
"outputId": "b58a608b-5a00-47c6-c96d-916d6471aa50"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"2912: '------------'\n",
"23796: '-------------------'\n",
"23380: '-----------------'\n",
"42277: '--;'\n",
"9032: '////////////////'\n",
"15623: '---|'\n",
"39485: '-------------------------------------------'\n",
"30624: ';/'\n",
"28693: '------------------------------'\n",
"39481: '//!'\n",
"42040: '----------------------------------------------'\n",
"6154: '------------------------------------------------'\n",
"43500: '-----------------------------------------------'\n",
"10521: '--------------'\n",
"39423: '----------------------------------------------------------------------------------------------------------------'\n",
"573: '----------------'\n",
"1532: '---'\n",
"32107: '-------------------------------'\n",
"26043: '------------------------'\n",
"22873: '---------------------'\n",
"16985: '////////////////////////////////'\n",
"13309: ';&'\n",
"37446: '////////////////////////////////////////////////////////////////'\n",
"36739: ';,'\n",
"29648: '-----------------------------'\n",
"4485: '------'\n",
"44136: '-------------------------------------------------'\n",
"28511: ';\\\\;'\n",
"35000: '-------------------------------------'\n",
"48151: '-------------------------------------------------------'\n",
"397: '--------'\n",
"45599: ';|'\n",
"10326: '#:'\n",
"6846: '-------'\n",
"38944: '-----------------------------------------'\n",
"5146: '######'\n",
"9962: '----------'\n",
"44391: '#{$'\n",
"33585: '-----------------------------------'\n",
"20782: '---|---'\n",
"22928: '------------------'\n",
"34638: '------------------------------------'\n",
"26836: '--\"'\n",
"27396: '#,'\n",
"26577: '#.'\n",
"39421: '------------------------------------------'\n",
"2917: '////'\n",
"36311: '---------------------------------------'\n",
"47726: \"#'\"\n",
"25916: '-----------------------'\n",
"16352: ';\"'\n",
"36960: '----------------------------------------'\n",
"42451: '---------------------------------------------'\n",
"13143: ';</'\n",
"37041: '------------------------------------------------------------------------'\n",
"27800: '--------------------------'\n",
"35349: '--------------------------------------'\n",
"47584: '---------------------------------------------------'\n",
"7405: ';\\\\'\n",
"45263: '--------------------------------------------------'\n",
"47332: '----------------------------------------------------'\n",
"48904: '------------------------------------------------------'\n",
"6675: '########'\n",
"30863: ';\"><'\n",
"23130: '----------------------'\n",
"22158: '-->'\n",
"1835: '####'\n",
"29295: '----------------------------'\n",
"12723: ';;'\n",
"28505: '------------------------------------------------------------------------------------------------'\n",
"22002: '#####'\n",
"43657: ';{\\\\'\n",
"29234: ';_'\n",
"32657: ';;;;'\n",
"16525: '----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'\n",
"10638: '///'\n",
"13011: ';\">'\n",
"817: '##'\n",
"22902: '################################'\n",
"7078: '--------------------------------------------------------------------------------------------------------------------------------'\n",
"50001: '--['\n",
"9794: '---------'\n",
"47632: '---|---|---'\n",
"48924: '//----------------------------------------------------------------'\n",
"26077: ';<'\n",
"11890: '################'\n",
"19628: '--------------------------------------------------------------------------------'\n",
"33301: '--**'\n",
"7040: '-----'\n",
"5071: '////////'\n",
"33250: '----------------------------------'\n",
"43067: '----------------------------------------------------------------------'\n",
"10428: '-------------'\n",
"28388: '---------------------------'\n",
"1013: '--------------------------------'\n",
"4118: '###'\n",
"9998: '-----------'\n",
"11311: '---------------'\n",
"34194: ';&#'\n",
"2518: '----------------------------------------------------------------'\n",
"15879: '--------------------'\n",
"22866: ';}'\n",
"40904: '--------------------------------------------'\n",
"30282: '---------------------------------'\n",
"315: '----'\n",
"24702: '-------------------------'\n",
"20744: ';\\\\;\\\\'\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Load the jina-embeddings-v3 vocabulary and print every entry that is_comment()\n",
"# flags, one per line, as '<id>: <repr of token>'.\n",
"voc_je_v3 = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v3').get_vocab()\n",
"\n",
"for k, index in voc_je_v3.items():\n",
"    if not is_comment(k):\n",
"        continue  # not comment-like; nothing to report\n",
"    print(f\"{index}: {repr(k)}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "u-EMCfnaAQtx",
"outputId": "1afcfdec-cba0-46a2-cec4-8a6c1e3bb8fb"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"187284: '###'\n",
"157724: ';;;'\n",
"55479: '---'\n",
"106115: '----'\n",
"152745: '-->'\n",
"223009: '////'\n",
"110405: '------'\n",
"103428: '----------------'\n",
"195626: '----------'\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment