lmmx · December 30, 2025 15:17 · lmmx · Dec 30, 2025
diff --git a/demo.py b/demo.py
 from pathlib import Path
 import re

 import polars as pl
 import polars_fastembed  # noqa: F401


 def clean_markdown(text: str) -> str:
     """Remove markdown and HTML markup from ``text`` and return what is left.

     The substitutions run in a fixed order, each on the output of the
     previous one:

     1. ``<table>...</table>`` blocks are deleted.
     2. Triple-backtick fenced blocks are deleted.
     3. Inline backtick spans are deleted together with their content.
     4. HTML comments are deleted.
     5. Literal ``[TOC]`` markers are deleted.
     6. Inline links ``[label](target)`` are reduced to ``label``.
     7. A ``---`` ... ``---`` block starting at the very beginning of the
        text is deleted.
     8. Leading ``#`` heading markers are stripped; the heading text stays.
     9. ``~~...~~`` spans are deleted together with their content.
     10. Lines of the form ``[label]: target`` are emptied (newline kept).
     """
     # (pattern, replacement, flags) triples; order matters because later
     # patterns see the text produced by earlier ones.
     substitutions = (
         (r'<table>[\s\S]*?</table>', '', 0),
         (r'```[\s\S]*?```', '', 0),
         (r'`[^`]+`', '', 0),
         (r'<!--[\s\S]*?-->', '', 0),
         (r'\[TOC\]', '', 0),
         (r'\[([^\]]+)\]\([^)]+\)', r'\1', 0),
         # No MULTILINE here: only a block at the start of the text matches.
         (r'^---[\s\S]*?---', '', 0),
         (r'^#+\s+', '', re.MULTILINE),
         (r'~~[^~]*~~', '', 0),
         (r'^\[.*\]:.*$', '', re.MULTILINE),
     )
     for pattern, replacement, flags in substitutions:
         text = re.sub(pattern, replacement, text, flags=flags)
     return text


 def extract_sentences(text: str, min_words: int = 5) -> list[str]:
     """Break ``text`` into sentences and drop the short ones.

     The text is split on runs of whitespace that directly follow ``.``,
     ``!`` or ``?``. Each piece is stripped of surrounding whitespace and
     kept only when it has strictly more than ``min_words``
     whitespace-separated words, so a piece with exactly ``min_words``
     words is discarded.
     """
     sentence_break = re.compile(r'(?<=[.!?])\s+')
     candidates = (piece.strip() for piece in sentence_break.split(text))
     return [sentence for sentence in candidates if len(sentence.split()) > min_words]


 def _print_pole(rows: list[dict], axis_idx: int, heading: str) -> None:
     """Print one pole: the heading, a rule, then one numbered line per row."""
     print(heading)
     print("  " + "-"*76)
     for rank, row in enumerate(rows, 1):
         weight = row["topic_weights"][axis_idx]
         # Truncate to 90 characters first, then flatten newlines in the slice.
         snippet = row["text"][:90].replace('\n', ' ')
         if len(row["text"]) > 90:
             snippet += "..."
         print(f"  {rank:2}. [{weight:+.2f}] {snippet}")


 def print_axis(result: pl.DataFrame, axis_idx: int, n: int = 8) -> None:
     """Print the positive and negative poles of a semantic axis.

     Rows are sorted ascending by element ``axis_idx`` of the
     ``topic_weights`` list column. The first ``n`` rows are printed as the
     negative pole and the last ``n`` rows, largest weight first, as the
     positive pole.

     Args:
         result: Frame with a ``text`` column and a ``topic_weights`` list
             column that has at least ``axis_idx + 1`` entries per row.
         axis_idx: Index of the axis within ``topic_weights``.
         n: Number of rows to show per pole. When the frame has fewer rows,
             every row is shown and the header ranges report the real count.
     """
     sorted_df = result.sort(pl.col("topic_weights").list.get(axis_idx))

     total = len(sorted_df)
     # Clamp so the "sentences a-b of total" headers never claim more rows
     # than exist (previously e.g. "sentences -4-3 of 3" when n > total).
     shown = min(n, total)
     neg_rows = list(sorted_df.head(shown).iter_rows(named=True))
     pos_rows = list(sorted_df.tail(shown).reverse().iter_rows(named=True))

     print(f"\n{'='*80}")
     print(f"AXIS {axis_idx}")
     print('='*80)

     _print_pole(
         neg_rows,
         axis_idx,
         f"\n  ◀ NEGATIVE POLE (sentences 1-{shown} of {total})",
     )
     _print_pole(
         pos_rows,
         axis_idx,
         f"\n  ▶ POSITIVE POLE (sentences {total-shown+1}-{total} of {total})",
     )


 def main() -> None:
     """Run the demo end to end and print the results.

     Reads ``inverse-problems.md`` from the current working directory,
     cleans it with ``clean_markdown`` and splits it with
     ``extract_sentences``. The sentences are embedded with the
     ``SnowflakeArcticEmbedXS`` model and decomposed into three components
     via the ``fastembed`` DataFrame namespace. Finally both poles of every
     axis are printed with ``print_axis``.
     """
     # Decode explicitly as UTF-8: without an encoding, read_text() falls back
     # to the platform locale, which can mis-decode non-ASCII markdown.
     text = Path("inverse-problems.md").read_text(encoding="utf-8")
     text = clean_markdown(text)
     sentences = extract_sentences(text)

     print(f"Extracted {len(sentences)} sentences\n")

     model_id = "SnowflakeArcticEmbedXS"
     df = pl.DataFrame({"text": sentences}).fastembed.embed("text", model_name=model_id)

     n_components = 3
     # print_axis reads a "topic_weights" column from this result; presumably
     # s3_topics adds it -- confirm against the polars_fastembed docs.
     result = df.fastembed.s3_topics(embedding_column="embedding", n_components=n_components)

     print("S³ SEMANTIC SIGNAL SEPARATION")
     print("="*80)
     print("Each axis is a bipolar semantic dimension discovered via ICA.")
     print("If the method works, sentences at each pole should be thematically coherent.")
     print("Judge for yourself.\n")

     for axis_idx in range(n_components):
         print_axis(result, axis_idx, n=8)

     print("\n")

 # Script entry point: run the demo only when executed directly, not on import.
 if __name__ == "__main__":
    main()
diff --git a/output.txt b/output.txt
 (2025-in-review) louis 🌟 ~/spin/ss/cog/src/posts/2025-in-review $ qp demo.py 
 Extracted 126 sentences

 S³ SEMANTIC SIGNAL SEPARATION
 ================================================================================
 Each axis is a bipolar semantic dimension discovered via ICA.
 If the method works, sentences at each pole should be thematically coherent.
 Judge for yourself.


 ================================================================================
 AXIS 0
 ================================================================================

  ◀ NEGATIVE POLE (sentences 1-8 of 126)
  ----------------------------------------------------------------------------
   1. [-5.42] Performance and contribution  polars-fastembed wraps fastembed-rs, a Rust port of the Pyth...
   2. [-3.95] I tried some decomposition with FastICA based on a literature review of papers like FASTop...
   3. [-3.24] The convergence step counts don't match between picard-ica and original Python PICARD beca...
   4. [-3.15] Since picard-ica is a Rust crate I just dropped it into polars-fastembed and it now underl...
   5. [-3.02] genson-rs couldn't do this (though it did dtype inference perfectly well, no need to reinv...
   6. [-2.92] I tried a few Rust linear algebra backends along the way with picard-ica and the Rust Fast...
   7. [-2.81] I ported it from the Python original because I needed it fast and without crossing the Rus...
   8. [-2.79] In all I made three intertwined packages: genson-core, avrotize-rs, and polars-genson.

  ▶ POSITIVE POLE (sentences 119-126 of 126)
  ----------------------------------------------------------------------------
   1. [+3.56] Since a Map is just an array in disguise, it can be of any length, as long as its values (...
   2. [+2.81] An object must have a particular length, and all its keys known.
   3. [+2.79] The difference between a Map type and an object comes down to whether or not the key shoul...
   4. [+2.74] The nice thing about the Map type is that it's allowed to be of any length!
   5. [+2.63] A Map type is actually an array type, but it looks like an object type.
   6. [+2.63] If you recognise it as a Map rather than an object you preserve the structure without this...
   7. [+2.47] Note that the 'key' is always a string type in a Map.
   8. [+2.22] A page is a surface, it bends according to material properties.

 ================================================================================
 AXIS 1
 ================================================================================

  ◀ NEGATIVE POLE (sentences 1-8 of 126)
  ----------------------------------------------------------------------------
   1. [-3.82] A field like  from JSON like  is really a key-value pair whose value is a scalar string fi...
   2. [-3.58] In my case though all I wanted was its JSON schema format (confusingly there is "Avro sche...
   3. [-3.29] JSON Schema is a good interchange format but Avro is often used by data engineers (it's ty...
   4. [-3.16] The difference between a Map type and an object comes down to whether or not the key shoul...
   5. [-3.12] genson-core is the first library I know of that can infer Map types from JSON.
   6. [-3.11] Schema inference as constraint discovery  genson-core infers the schema from JSON (fields,...
   7. [-2.89] A Map type is actually an array type, but it looks like an object type.
   8. [-2.89] From JSON Schema to Avro  avrotize-rs handles schema translation, and was a port of the av...

  ▶ POSITIVE POLE (sentences 119-126 of 126)
  ----------------------------------------------------------------------------
   1. [+2.67] L-BFGS relies on successive gradients to infer local curvature, so imprecise gradients mea...
   2. [+2.47] Physically grounded methods  Around 2020, dewarping research shifted from physically-groun...
   3. [+2.40] Grid-based dewarping methods are always going to be privileged over spline fitting ones wh...
   4. [+2.38] Matt Zucker's original blog post treats a photographed page as a bent surface rather than ...
   5. [+2.24] Train a network to predict a pixel displacement field for crumpled paper, that's what you'...
   6. [+2.13] Solving the non-linear least squares problem  Zucker left a tip in his blog post that this...
   7. [+2.09] SciPy's L-BFGS on the other hand (another gradient-based method) worked but converged slow...
   8. [+2.01] To recap, the program uses a cubic sheet model (meaning it fits a cubic curve), specifical...

 ================================================================================
 AXIS 2
 ================================================================================

  ◀ NEGATIVE POLE (sentences 1-8 of 126)
  ----------------------------------------------------------------------------
   1. [-2.65] With some guidance from LLMs I circled back to try it again, but saw it perform poorly any...
   2. [-2.63] that will have the same reliability.
   3. [-2.58] Since the algo ultimately calls LAPACK, I saw this perf as essentially being bottlenecked ...
   4. [-2.49] If you get the algo wrong and it hits the 200 max iterations rather than 20 or 30 you can'...
   5. [-2.46] If you mess any of it up you simply cannot converge as fast as a correct implementation.
   6. [-2.17] Wikidata indeed had examples of this, and my solution was to 'normalise' them.
   7. [-2.08] Note that without such a routine, you'd still have to deal with these awkward customers, a...
   8. [-2.05] Thus followed much battle testing and perf chasing...

  ▶ POSITIVE POLE (sentences 119-126 of 126)
  ----------------------------------------------------------------------------
   1. [+4.79] - Dewarping page images to recover flattened geometry from photographed books - Schema inf...
   2. [+4.29] Embeddings tell you where something is in semantic space (its "meaning"), and their decomp...
   3. [+3.21] Physically grounded methods  Around 2020, dewarping research shifted from physically-groun...
   4. [+2.61] Decomposition with ICA  Once you have embeddings you typically either want to perform sema...
   5. [+2.55] You fit these splines to detected text contours, then solve for the coefficients that best...
   6. [+2.39] A page is a surface, it bends according to material properties.
   7. [+2.33] A field like  from JSON like  is really a key-value pair whose value is a scalar string fi...
   8. [+2.22] Embeddings as a DataFrame operation  Computing embeddings locally removes API costs and la...
	from pathlib import Path
	import re

	import polars as pl
	import polars_fastembed # noqa: F401


	def clean_markdown(text: str) -> str:
	    """Strip markdown/HTML artifacts from text.

	    Substitutions are applied in order, each on the previous result:
	    HTML tables, fenced code blocks, inline code spans, HTML comments and
	    ``[TOC]`` markers are deleted. Inline links keep only their label. A
	    ``---`` ... ``---`` block at the very start of the text is deleted.
	    Heading ``#`` markers are stripped. ``~~...~~`` spans are deleted.
	    Lines of the form ``[label]: target`` are emptied.
	    """
	    text = re.sub(r'<table>[\s\S]*?</table>', '', text)
	    text = re.sub(r'```[\s\S]*?```', '', text)
	    text = re.sub(r'`[^`]+`', '', text)
	    text = re.sub(r'<!--[\s\S]*?-->', '', text)
	    text = re.sub(r'\[TOC\]', '', text)
	    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
	    # No MULTILINE: only a block at the very start of the text is removed.
	    text = re.sub(r'^---[\s\S]*?---', '', text)
	    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
	    text = re.sub(r'~~[^~]*~~', '', text)
	    # The ``*`` quantifiers are required: without them only a one-character
	    # label and a one-character target would match.
	    text = re.sub(r'^\[.*\]:.*$', '', text, flags=re.MULTILINE)
	    return text


	def extract_sentences(text: str, min_words: int = 5) -> list[str]:
	    """Split text into sentences, filtering short ones.

	    The text is split on whitespace that directly follows ``.``, ``!`` or
	    ``?``. A piece is kept, stripped of surrounding whitespace, only when
	    it has strictly more than ``min_words`` whitespace-separated words.
	    """
	    sentences = re.split(r'(?<=[.!?])\s+', text)
	    return [s.strip() for s in sentences if len(s.split()) > min_words]


	def print_axis(result: pl.DataFrame, axis_idx: int, n: int = 8) -> None:
	    """Print the positive and negative poles of a semantic axis.

	    Rows are sorted ascending by element ``axis_idx`` of the
	    ``topic_weights`` list column. The first ``n`` rows form the negative
	    pole and the last ``n`` rows, largest weight first, form the positive
	    pole. Each row's ``text`` is cut to 90 characters, with ``...``
	    appended when it was longer.

	    Args:
	        result: Frame with ``text`` and ``topic_weights`` columns.
	        axis_idx: Index of the axis within ``topic_weights``.
	        n: Rows to show per pole. When the frame has fewer rows, every
	            row is shown and the header ranges report the real count.
	    """
	    sorted_df = result.sort(pl.col("topic_weights").list.get(axis_idx))

	    total = len(sorted_df)
	    # Clamp so the header ranges never claim more rows than exist.
	    shown = min(n, total)
	    neg_rows = list(sorted_df.head(shown).iter_rows(named=True))
	    pos_rows = list(sorted_df.tail(shown).reverse().iter_rows(named=True))

	    print(f"\n{'='*80}")
	    print(f"AXIS {axis_idx}")
	    print('='*80)

	    # Both poles share one layout, so they are printed by a single loop.
	    poles = (
	        (f"\n ◀ NEGATIVE POLE (sentences 1-{shown} of {total})", neg_rows),
	        (f"\n ▶ POSITIVE POLE (sentences {total-shown+1}-{total} of {total})", pos_rows),
	    )
	    for heading, rows in poles:
	        print(heading)
	        print(" " + "-"*76)
	        for i, row in enumerate(rows, 1):
	            weight = row["topic_weights"][axis_idx]
	            text = row["text"][:90].replace('\n', ' ')
	            if len(row["text"]) > 90:
	                text += "..."
	            print(f" {i:2}. [{weight:+.2f}] {text}")


	def main() -> None:
	    """Run the demo end to end and print the results.

	    Reads ``inverse-problems.md`` from the current working directory,
	    cleans and splits it into sentences, embeds them with the
	    ``SnowflakeArcticEmbedXS`` model and decomposes the embeddings into
	    three components. Both poles of every axis are then printed.
	    """
	    # Decode explicitly as UTF-8: without an encoding, read_text() falls
	    # back to the platform locale, which can mis-decode non-ASCII markdown.
	    text = Path("inverse-problems.md").read_text(encoding="utf-8")
	    text = clean_markdown(text)
	    sentences = extract_sentences(text)

	    print(f"Extracted {len(sentences)} sentences\n")

	    model_id = "SnowflakeArcticEmbedXS"
	    df = pl.DataFrame({"text": sentences}).fastembed.embed("text", model_name=model_id)

	    n_components = 3
	    result = df.fastembed.s3_topics(embedding_column="embedding", n_components=n_components)

	    print("S³ SEMANTIC SIGNAL SEPARATION")
	    print("="*80)
	    print("Each axis is a bipolar semantic dimension discovered via ICA.")
	    print("If the method works, sentences at each pole should be thematically coherent.")
	    print("Judge for yourself.\n")

	    for axis_idx in range(n_components):
	        print_axis(result, axis_idx, n=8)

	    print("\n")


	# Script entry point: run the demo only when executed directly, not on import.
	if __name__ == "__main__":
	    main()
	(2025-in-review) louis 🌟 ~/spin/ss/cog/src/posts/2025-in-review $ qp demo.py
	Extracted 126 sentences

	S³ SEMANTIC SIGNAL SEPARATION
	================================================================================
	Each axis is a bipolar semantic dimension discovered via ICA.
	If the method works, sentences at each pole should be thematically coherent.
	Judge for yourself.


	================================================================================
	AXIS 0
	================================================================================

	◀ NEGATIVE POLE (sentences 1-8 of 126)
	----------------------------------------------------------------------------
	1. [-5.42] Performance and contribution polars-fastembed wraps fastembed-rs, a Rust port of the Pyth...
	2. [-3.95] I tried some decomposition with FastICA based on a literature review of papers like FASTop...
	3. [-3.24] The convergence step counts don't match between picard-ica and original Python PICARD beca...
	4. [-3.15] Since picard-ica is a Rust crate I just dropped it into polars-fastembed and it now underl...
	5. [-3.02] genson-rs couldn't do this (though it did dtype inference perfectly well, no need to reinv...
	6. [-2.92] I tried a few Rust linear algebra backends along the way with picard-ica and the Rust Fast...
	7. [-2.81] I ported it from the Python original because I needed it fast and without crossing the Rus...
	8. [-2.79] In all I made three intertwined packages: genson-core, avrotize-rs, and polars-genson.

	▶ POSITIVE POLE (sentences 119-126 of 126)
	----------------------------------------------------------------------------
	1. [+3.56] Since a Map is just an array in disguise, it can be of any length, as long as its values (...
	2. [+2.81] An object must have a particular length, and all its keys known.
	3. [+2.79] The difference between a Map type and an object comes down to whether or not the key shoul...
	4. [+2.74] The nice thing about the Map type is that it's allowed to be of any length!
	5. [+2.63] A Map type is actually an array type, but it looks like an object type.
	6. [+2.63] If you recognise it as a Map rather than an object you preserve the structure without this...
	7. [+2.47] Note that the 'key' is always a string type in a Map.
	8. [+2.22] A page is a surface, it bends according to material properties.

	================================================================================
	AXIS 1
	================================================================================

	◀ NEGATIVE POLE (sentences 1-8 of 126)
	----------------------------------------------------------------------------
	1. [-3.82] A field like from JSON like is really a key-value pair whose value is a scalar string fi...
	2. [-3.58] In my case though all I wanted was its JSON schema format (confusingly there is "Avro sche...
	3. [-3.29] JSON Schema is a good interchange format but Avro is often used by data engineers (it's ty...
	4. [-3.16] The difference between a Map type and an object comes down to whether or not the key shoul...
	5. [-3.12] genson-core is the first library I know of that can infer Map types from JSON.
	6. [-3.11] Schema inference as constraint discovery genson-core infers the schema from JSON (fields,...
	7. [-2.89] A Map type is actually an array type, but it looks like an object type.
	8. [-2.89] From JSON Schema to Avro avrotize-rs handles schema translation, and was a port of the av...

	▶ POSITIVE POLE (sentences 119-126 of 126)
	----------------------------------------------------------------------------
	1. [+2.67] L-BFGS relies on successive gradients to infer local curvature, so imprecise gradients mea...
	2. [+2.47] Physically grounded methods Around 2020, dewarping research shifted from physically-groun...
	3. [+2.40] Grid-based dewarping methods are always going to be privileged over spline fitting ones wh...
	4. [+2.38] Matt Zucker's original blog post treats a photographed page as a bent surface rather than ...
	5. [+2.24] Train a network to predict a pixel displacement field for crumpled paper, that's what you'...
	6. [+2.13] Solving the non-linear least squares problem Zucker left a tip in his blog post that this...
	7. [+2.09] SciPy's L-BFGS on the other hand (another gradient-based method) worked but converged slow...
	8. [+2.01] To recap, the program uses a cubic sheet model (meaning it fits a cubic curve), specifical...

	================================================================================
	AXIS 2
	================================================================================

	◀ NEGATIVE POLE (sentences 1-8 of 126)
	----------------------------------------------------------------------------
	1. [-2.65] With some guidance from LLMs I circled back to try it again, but saw it perform poorly any...
	2. [-2.63] that will have the same reliability.
	3. [-2.58] Since the algo ultimately calls LAPACK, I saw this perf as essentially being bottlenecked ...
	4. [-2.49] If you get the algo wrong and it hits the 200 max iterations rather than 20 or 30 you can'...
	5. [-2.46] If you mess any of it up you simply cannot converge as fast as a correct implementation.
	6. [-2.17] Wikidata indeed had examples of this, and my solution was to 'normalise' them.
	7. [-2.08] Note that without such a routine, you'd still have to deal with these awkward customers, a...
	8. [-2.05] Thus followed much battle testing and perf chasing...

	▶ POSITIVE POLE (sentences 119-126 of 126)
	----------------------------------------------------------------------------
	1. [+4.79] - Dewarping page images to recover flattened geometry from photographed books - Schema inf...
	2. [+4.29] Embeddings tell you where something is in semantic space (its "meaning"), and their decomp...
	3. [+3.21] Physically grounded methods Around 2020, dewarping research shifted from physically-groun...
	4. [+2.61] Decomposition with ICA Once you have embeddings you typically either want to perform sema...
	5. [+2.55] You fit these splines to detected text contours, then solve for the coefficients that best...
	6. [+2.39] A page is a surface, it bends according to material properties.
	7. [+2.33] A field like from JSON like is really a key-value pair whose value is a scalar string fi...
	8. [+2.22] Embeddings as a DataFrame operation Computing embeddings locally removes API costs and la...