Last active
December 30, 2025 15:17
-
-
Save lmmx/ca08884d9690da7ecf4a20107b40002b to your computer and use it in GitHub Desktop.
Demo of S3 semantic axes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pathlib import Path | |
| import re | |
| import polars as pl | |
| import polars_fastembed # noqa: F401 | |
def clean_markdown(text: str) -> str:
    """Remove markdown and HTML markup from *text*, leaving mostly prose.

    The substitutions run strictly in the order listed below; the order
    matters (for example fenced code blocks are removed before inline code
    spans so the fence backticks are never mistaken for inline code).
    """
    # (pattern, replacement, flags) applied top to bottom.
    rules = (
        (r'<table>[\s\S]*?</table>', '', 0),       # bare <table> blocks
        (r'```[\s\S]*?```', '', 0),                # fenced code blocks
        (r'`[^`]+`', '', 0),                       # inline code spans
        (r'<!--[\s\S]*?-->', '', 0),               # HTML comments
        (r'\[TOC\]', '', 0),                       # table-of-contents marker
        (r'\[([^\]]+)\]\([^)]+\)', r'\1', 0),      # [label](url) -> label
        (r'^---[\s\S]*?---', '', 0),               # leading '---' ... '---' block
        (r'^#+\s+', '', re.MULTILINE),             # heading markers
        (r'~~[^~]*~~', '', 0),                     # struck-through text
        (r'^\[.*\]:.*$', '', re.MULTILINE),        # reference link definitions
    )
    for pattern, replacement, flags in rules:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text
def extract_sentences(text: str, min_words: int = 5) -> list[str]:
    """Break *text* into sentences and drop the short ones.

    A sentence boundary is any run of whitespace that follows ``.``, ``!``
    or ``?``. A sentence is kept only when it has strictly more than
    ``min_words`` whitespace-separated words; kept sentences are stripped
    of surrounding whitespace.
    """
    kept = []
    for candidate in re.split(r'(?<=[.!?])\s+', text):
        word_count = len(candidate.split())
        if word_count > min_words:
            kept.append(candidate.strip())
    return kept
def _print_pole_rows(rows, axis_idx: int) -> None:
    """Print one numbered line per row: its weight on the axis and a text preview."""
    for i, row in enumerate(rows, 1):
        weight = row["topic_weights"][axis_idx]
        # Preview is the first 90 characters with newlines flattened;
        # an ellipsis marks text that was cut off.
        text = row["text"][:90].replace('\n', ' ')
        if len(row["text"]) > 90:
            text += "..."
        print(f" {i:2}. [{weight:+.2f}] {text}")


def print_axis(result: pl.DataFrame, axis_idx: int, n: int = 8) -> None:
    """Print the positive and negative poles of a semantic axis.

    Rows of ``result`` are sorted by the ``axis_idx``-th entry of their
    ``topic_weights`` list. The first rows of that ordering are printed as
    the negative pole and the last rows (most extreme first) as the
    positive pole.

    Args:
        result: Frame with a ``text`` column and a ``topic_weights`` list
            column that has at least ``axis_idx + 1`` entries per row.
        axis_idx: Index of the axis within ``topic_weights``.
        n: Number of sentences to show per pole. When the frame has fewer
            than ``n`` rows the count is clamped so the printed ranges
            stay within ``1..total``.
    """
    sorted_df = result.sort(pl.col("topic_weights").list.get(axis_idx))
    total = len(sorted_df)
    # Clamp so the headers never claim more rows than exist (the original
    # printed e.g. "sentences -4-3 of 3" for a 3-row frame with n=8).
    shown = min(n, total)

    print(f"\n{'='*80}")
    print(f"AXIS {axis_idx}")
    print('='*80)

    print(f"\n ◀ NEGATIVE POLE (sentences 1-{shown} of {total})")
    print(" " + "-"*76)
    _print_pole_rows(sorted_df.head(shown).iter_rows(named=True), axis_idx)

    print(f"\n ▶ POSITIVE POLE (sentences {total-shown+1}-{total} of {total})")
    print(" " + "-"*76)
    # Reverse the tail so the most positive sentence is listed first.
    _print_pole_rows(
        sorted_df.tail(shown).reverse().iter_rows(named=True), axis_idx
    )
def main() -> None:
    """Run the demo: embed the sentences of a markdown post and print its S³ axes.

    Reads ``inverse-problems.md`` from the current working directory,
    strips markup, splits the remainder into sentences, embeds them, and
    prints both poles of each discovered axis.
    """
    # Decode explicitly as UTF-8: without an encoding, read_text() falls back
    # to the platform locale, which can mis-decode non-ASCII characters in
    # the markdown on systems whose default is not UTF-8.
    text = Path("inverse-problems.md").read_text(encoding="utf-8")
    text = clean_markdown(text)
    sentences = extract_sentences(text)
    print(f"Extracted {len(sentences)} sentences\n")

    model_id = "SnowflakeArcticEmbedXS"
    df = pl.DataFrame({"text": sentences}).fastembed.embed("text", model_name=model_id)

    n_components = 3
    result = df.fastembed.s3_topics(embedding_column="embedding", n_components=n_components)

    print("S³ SEMANTIC SIGNAL SEPARATION")
    print("="*80)
    print("Each axis is a bipolar semantic dimension discovered via ICA.")
    print("If the method works, sentences at each pole should be thematically coherent.")
    print("Judge for yourself.\n")

    for axis_idx in range(n_components):
        print_axis(result, axis_idx, n=8)
    print("\n")
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (2025-in-review) louis 🌟 ~/spin/ss/cog/src/posts/2025-in-review $ qp demo.py | |
| Extracted 126 sentences | |
| S³ SEMANTIC SIGNAL SEPARATION | |
| ================================================================================ | |
| Each axis is a bipolar semantic dimension discovered via ICA. | |
| If the method works, sentences at each pole should be thematically coherent. | |
| Judge for yourself. | |
| ================================================================================ | |
| AXIS 0 | |
| ================================================================================ | |
| ◀ NEGATIVE POLE (sentences 1-8 of 126) | |
| ---------------------------------------------------------------------------- | |
| 1. [-5.42] Performance and contribution polars-fastembed wraps fastembed-rs, a Rust port of the Pyth... | |
| 2. [-3.95] I tried some decomposition with FastICA based on a literature review of papers like FASTop... | |
| 3. [-3.24] The convergence step counts don't match between picard-ica and original Python PICARD beca... | |
| 4. [-3.15] Since picard-ica is a Rust crate I just dropped it into polars-fastembed and it now underl... | |
| 5. [-3.02] genson-rs couldn't do this (though it did dtype inference perfectly well, no need to reinv... | |
| 6. [-2.92] I tried a few Rust linear algebra backends along the way with picard-ica and the Rust Fast... | |
| 7. [-2.81] I ported it from the Python original because I needed it fast and without crossing the Rus... | |
| 8. [-2.79] In all I made three intertwined packages: genson-core, avrotize-rs, and polars-genson. | |
| ▶ POSITIVE POLE (sentences 119-126 of 126) | |
| ---------------------------------------------------------------------------- | |
| 1. [+3.56] Since a Map is just an array in disguise, it can be of any length, as long as its values (... | |
| 2. [+2.81] An object must have a particular length, and all its keys known. | |
| 3. [+2.79] The difference between a Map type and an object comes down to whether or not the key shoul... | |
| 4. [+2.74] The nice thing about the Map type is that it's allowed to be of any length! | |
| 5. [+2.63] A Map type is actually an array type, but it looks like an object type. | |
| 6. [+2.63] If you recognise it as a Map rather than an object you preserve the structure without this... | |
| 7. [+2.47] Note that the 'key' is always a string type in a Map. | |
| 8. [+2.22] A page is a surface, it bends according to material properties. | |
| ================================================================================ | |
| AXIS 1 | |
| ================================================================================ | |
| ◀ NEGATIVE POLE (sentences 1-8 of 126) | |
| ---------------------------------------------------------------------------- | |
| 1. [-3.82] A field like from JSON like is really a key-value pair whose value is a scalar string fi... | |
| 2. [-3.58] In my case though all I wanted was its JSON schema format (confusingly there is "Avro sche... | |
| 3. [-3.29] JSON Schema is a good interchange format but Avro is often used by data engineers (it's ty... | |
| 4. [-3.16] The difference between a Map type and an object comes down to whether or not the key shoul... | |
| 5. [-3.12] genson-core is the first library I know of that can infer Map types from JSON. | |
| 6. [-3.11] Schema inference as constraint discovery genson-core infers the schema from JSON (fields,... | |
| 7. [-2.89] A Map type is actually an array type, but it looks like an object type. | |
| 8. [-2.89] From JSON Schema to Avro avrotize-rs handles schema translation, and was a port of the av... | |
| ▶ POSITIVE POLE (sentences 119-126 of 126) | |
| ---------------------------------------------------------------------------- | |
| 1. [+2.67] L-BFGS relies on successive gradients to infer local curvature, so imprecise gradients mea... | |
| 2. [+2.47] Physically grounded methods Around 2020, dewarping research shifted from physically-groun... | |
| 3. [+2.40] Grid-based dewarping methods are always going to be privileged over spline fitting ones wh... | |
| 4. [+2.38] Matt Zucker's original blog post treats a photographed page as a bent surface rather than ... | |
| 5. [+2.24] Train a network to predict a pixel displacement field for crumpled paper, that's what you'... | |
| 6. [+2.13] Solving the non-linear least squares problem Zucker left a tip in his blog post that this... | |
| 7. [+2.09] SciPy's L-BFGS on the other hand (another gradient-based method) worked but converged slow... | |
| 8. [+2.01] To recap, the program uses a cubic sheet model (meaning it fits a cubic curve), specifical... | |
| ================================================================================ | |
| AXIS 2 | |
| ================================================================================ | |
| ◀ NEGATIVE POLE (sentences 1-8 of 126) | |
| ---------------------------------------------------------------------------- | |
| 1. [-2.65] With some guidance from LLMs I circled back to try it again, but saw it perform poorly any... | |
| 2. [-2.63] that will have the same reliability. | |
| 3. [-2.58] Since the algo ultimately calls LAPACK, I saw this perf as essentially being bottlenecked ... | |
| 4. [-2.49] If you get the algo wrong and it hits the 200 max iterations rather than 20 or 30 you can'... | |
| 5. [-2.46] If you mess any of it up you simply cannot converge as fast as a correct implementation. | |
| 6. [-2.17] Wikidata indeed had examples of this, and my solution was to 'normalise' them. | |
| 7. [-2.08] Note that without such a routine, you'd still have to deal with these awkward customers, a... | |
| 8. [-2.05] Thus followed much battle testing and perf chasing... | |
| ▶ POSITIVE POLE (sentences 119-126 of 126) | |
| ---------------------------------------------------------------------------- | |
| 1. [+4.79] - Dewarping page images to recover flattened geometry from photographed books - Schema inf... | |
| 2. [+4.29] Embeddings tell you where something is in semantic space (its "meaning"), and their decomp... | |
| 3. [+3.21] Physically grounded methods Around 2020, dewarping research shifted from physically-groun... | |
| 4. [+2.61] Decomposition with ICA Once you have embeddings you typically either want to perform sema... | |
| 5. [+2.55] You fit these splines to detected text contours, then solve for the coefficients that best... | |
| 6. [+2.39] A page is a surface, it bends according to material properties. | |
| 7. [+2.33] A field like from JSON like is really a key-value pair whose value is a scalar string fi... | |
| 8. [+2.22] Embeddings as a DataFrame operation Computing embeddings locally removes API costs and la... |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Click to show unannotated
Annotated: