Created
February 9, 2026 00:08
-
-
Save jweinst1/a5aa4453ab73b9499d66d3c1c6eedc71 to your computer and use it in GitHub Desktop.
Hamming-distance-based LLM that needs no training
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <arm_neon.h> // Specialized for your M2 chip

#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
// A HyperVector is a fixed-width bit string stored as 16 x 64-bit words:
// 1024 bits, 128 bytes. alignas(64) starts every instance on a 64-byte
// boundary.
struct alignas(64) HyperVector {
    static constexpr int kWords = 16;          // 64-bit words per vector
    static constexpr int kBits = kWords * 64;  // total width; also the largest possible Hamming distance
    uint64_t bits[kWords];
};
static_assert(sizeof(HyperVector) == HyperVector::kWords * sizeof(uint64_t),
              "HyperVector must contain exactly kWords words and no padding");

// Maps strings ("roots") to pseudo-random hypervectors, bundles several roots
// into one document fingerprint, and measures similarity as Hamming distance.
class SemanticEngine {
private:
    // Cache of vectors already generated, keyed by root string.
    std::unordered_map<std::string, HyperVector> dna_registry;

    // 64-bit FNV-1a hash of `text`, used as the PRNG seed.
    // std::hash is deliberately not used here: its result is only guaranteed
    // to be stable within a single execution of the program, so vectors seeded
    // from it are not reproducible across runs or standard libraries.
    static uint64_t stable_seed(const std::string& text) {
        uint64_t hash = 0xcbf29ce484222325ULL;   // FNV offset basis
        for (const unsigned char byte : text) {
            hash ^= byte;
            hash *= 0x100000001b3ULL;            // FNV prime
        }
        return hash;
    }

public:
    // Returns the hypervector assigned to `root`, generating and caching it on
    // first use. The same string always yields the same vector, in any
    // SemanticEngine instance.
    HyperVector get_dna(const std::string& root) {
        const auto cached = dna_registry.find(root);
        if (cached != dna_registry.end()) return cached->second;

        HyperVector hv;
        std::mt19937_64 gen(stable_seed(root));
        // Raw engine output already covers the full 64-bit range, so no
        // distribution object is needed (and the result stays portable).
        for (uint64_t& word : hv.bits) word = gen();

        dna_registry.emplace(root, hv);
        return hv;
    }

    // Bundles the vectors of all `roots` into one document fingerprint using
    // bitwise OR. An empty list yields the all-zero vector.
    // Note: get_dna() produces vectors with roughly half of their bits set, so
    // every additional root pushes the fingerprint closer to all-ones. OR
    // bundling is therefore only discriminative for a handful of roots; a
    // majority vote would be needed for larger documents.
    HyperVector create_fingerprint(const std::vector<std::string>& roots) {
        HyperVector doc{};
        for (const auto& root : roots) {
            const HyperVector root_dna = get_dna(root);
            for (int i = 0; i < HyperVector::kWords; ++i) {
                doc.bits[i] |= root_dna.bits[i];
            }
        }
        return doc;
    }

    // Hamming distance between `a` and `b`: the number of bit positions in
    // which they differ, in the range [0, HyperVector::kBits].
    int compare(const HyperVector& a, const HyperVector& b) const {
        int distance = 0;
        for (int i = 0; i < HyperVector::kWords; ++i) {
            // __builtin_popcountll is a GCC/Clang builtin that counts set bits;
            // the compiler lowers it to a hardware population count where the
            // target provides one.
            distance += __builtin_popcountll(a.bits[i] ^ b.bits[i]);
        }
        return distance;
    }
};
| int main() { | |
| SemanticEngine engine; | |
| // 1. Define our concepts | |
| // Document A: Heart Inflammation (Cardio + Itis) | |
| std::vector<std::string> docA_roots = {"cardio", "itis"}; | |
| // Document B: Heart Disease (Cardio + Patho) | |
| std::vector<std::string> docB_roots = {"cardio", "patho"}; | |
| // Document C: Bone Disease (Osteo + Patho) - Unrelated to Heart | |
| std::vector<std::string> docC_roots = {"osteo", "patho"}; | |
| // 2. Generate Fingerprints | |
| auto fingerA = engine.create_fingerprint(docA_roots); | |
| auto fingerB = engine.create_fingerprint(docB_roots); | |
| auto fingerC = engine.create_fingerprint(docC_roots); | |
| // 3. The Query: Find things related to "Heart" (Cardio) | |
| auto query = engine.get_dna("cardio"); | |
| std::cout << "--- 1-Bit Semantic Search Benchmark (M2) ---" << std::endl; | |
| std::cout << "Target Query: 'cardio'" << std::endl; | |
| std::cout << "Distance to 'Cardio-Itis': " << engine.compare(fingerA, query) << std::endl; | |
| std::cout << "Distance to 'Cardio-Patho': " << engine.compare(fingerB, query) << std::endl; | |
| std::cout << "Distance to 'Osteo-Patho': " << engine.compare(fingerC, query) << std::endl; | |
| std::cout << "(Lower score = higher similarity. Random noise is ~512)" << std::endl; | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment