Skip to content

Instantly share code, notes, and snippets.

@andrevdm
Created April 20, 2025 16:11

Revisions

  1. andrevdm created this gist Apr 20, 2025.
    438 changes: 438 additions & 0 deletions App.hs
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,438 @@
    {-# LANGUAGE OverloadedStrings #-}
    {-# LANGUAGE NoImplicitPrelude #-}
    {-# LANGUAGE OverloadedRecordDot #-}
    {-# LANGUAGE DuplicateRecordFields #-}
    {-# LANGUAGE QuasiQuotes #-}

    module App where

    import Verset
    import Control.Exception.Safe (throwString, bracket)
    import qualified Database.PostgreSQL.Simple as Pg
    import qualified Data.List.NonEmpty as NE
    import qualified Data.Text as Txt
    import qualified Data.Text.IO as Txt
    import qualified Ollama as O
    import qualified Pgvector as Pgv
    import qualified System.Console.Haskeline as HL
    import qualified System.Directory as Dir
    import qualified System.FilePath as Path
    import System.FilePath ((</>))
    import Text.Pretty.Simple (pPrint)

    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    -- Model selection
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    summariseInputModel :: Text
    summariseInputModel = "gemma3"


    summariseQueryModel :: Text
    --summariseQueryModel = "gemma3:12b"
    summariseQueryModel = "gemma3"


    -- Nomic is faster, use bge-m3 if you need better quality, but remember to change the vector size in the DB (see setup.sql, and recreate the DB)
    embedModel :: Text
    embedModel = "nomic-embed-text:latest"
    --embedModel = "bge-m3:latest"


    replModel :: Text
    --replModel = "gemma3:12b"
    replModel = "gemma3"
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



    data ReplState = ReplState
    { rsVerbose :: !Bool
    , rsHistory :: ![O.Message]
    , rsReplPromptMsg :: !O.Message
    }

    app :: IO ()
    app = do
    startRepl


    createConn :: IO Pg.Connection
    createConn =
    Pg.connect Pg.defaultConnectInfo
    { Pg.connectHost = "localhost"
    , Pg.connectPort = 5532
    , Pg.connectUser = "demo"
    , Pg.connectPassword = ""
    , Pg.connectDatabase = "demo"
    }


    startRepl :: IO ()
    startRepl = do
    --------------------------------------------------------------------------------------------
    -- Index
    --------------------------------------------------------------------------------------------
    putText "Indexing..."
    connIdx <- createConn

    -- Get all the haskell files in ./_index_root
    -- e.g. `git clone https://github.com/scotty-web/scotty.git ./_index_root/scotty/

    let root = "./_index_root/"
    fs1 <- getFilesRec root
    let
    fs2 = filter (\f -> Path.takeExtension f == ".hs") fs1
    fs3 = Txt.pack <$> fs2
    fs4 = filter (\f -> not $ Txt.isInfixOf ".git" f) fs3
    fs = Txt.drop (length root) <$> fs4


    -- Summarise each file, embed and store in the database
    let num = length fs
    for_ (zip fs [1::Int ..]) $ \(f, ix) -> do
    putText $ "Indexing(" <> show ix <> "/" <> show num <> "): " <> f
    docTxt <- Txt.readFile (root </> Txt.unpack f)
    indexIfNotExists summariseFunctions connIdx f docTxt
    pass

    Pg.close connIdx
    --------------------------------------------------------------------------------------------



    --------------------------------------------------------------------------------------------
    -- REPL
    --------------------------------------------------------------------------------------------
    let replSysMessage = Txt.unlines
    [ "Here is a list of the top level functions in this code, and a short summary of each."
    ]

    let st0 =
    ReplState
    { rsVerbose = False
    , rsReplPromptMsg = O.Message O.User replSysMessage Nothing Nothing
    , rsHistory = []
    }


    putText ""
    putText ""
    putText "Welcome to the REPL. Type :quit to exit."
    putText "Type :verbose to see the full response."
    putText "Type :quiet to hide the full response."
    putText ""

    HL.runInputT
    (HL.Settings
    { HL.historyFile = Just "llm.repl.txt"
    , HL.complete = HL.noCompletion
    , HL.autoAddHistory = True
    }
    )
    (loop st0)
    --------------------------------------------------------------------------------------------

    where
    loop :: ReplState -> HL.InputT IO ()
    loop st1 = do
    minput <- HL.getInputLine ""
    case minput of
    Nothing -> pure ()
    Just ":quit" -> pure ()
    Just ":quiet" -> loop $ st1 { rsVerbose = False }
    Just ":verbose" -> loop $ st1 { rsVerbose = True }
    Just input -> do
    st2 <- liftIO . handleInput st1 $ Txt.pack input
    loop st2



    handleInput :: ReplState -> Text -> IO ReplState
    handleInput st1 input = do
    handleMessage
    st1
    O.Message
    { role = O.User
    , content = input
    , images = Nothing
    , tool_calls = Nothing
    }



    handleMessage :: ReplState -> O.Message -> IO ReplState
    handleMessage st1 inputMsg = do
    -- Summarise the user's input so we can use it to search the vector database
    summarisedInput <- summariseFunctions inputMsg.content
    searchRes1 <- do
    bracket
    createConn
    Pg.close
    (\conn -> do
    -- Search the vector database for the summarised input
    -- Get max 4 results
    search conn summarisedInput 4
    )

    -- Only use the vector search results
    -- You could use the FTS results as well, but in this demo I'm not using them
    -- To use them do: `fst searchRes1 <> snd searchRes1`
    let searchRes2 = fst searchRes1
    let searchRes = searchRes2 <&> \(src, content, _score) -> "In file: " <> src <> "\n" <> content

    -- Create history messages for the search results (RAG)
    let ragMessages = searchRes <&> \content -> O.Message O.System content Nothing Nothing

    -- Create the input messages
    let msgs = [] -- message history
    -- Start with the last 10 messages
    <> (reverse . take 10 . reverse . excludeSystemMsgs . excludeReplPromptMsg $ st1.rsHistory)
    <> [st1.rsReplPromptMsg] -- Keep the repl prompt message
    <> ragMessages
    <> [inputMsg] -- The current user message

    chatRes <- chat msgs

    if st1.rsVerbose
    then do
    putText ""
    putText "=================="
    pPrint chatRes
    else
    putText $ maybe "" O.content chatRes.message

    pure $ st1 { rsHistory = msgs <> maybe [] (:[]) chatRes.message }


    where
    -- Model to use. Install with `ollama pull <model_name>`
    -- Make sure it has the tool capability (use `ollama show <model_name>` to check)
    -- And look at the Berkeley leaderboard to pick a good model for function calls. https://gorilla.cs.berkeley.edu/leaderboard.html
    myCopt = mkCopt replModel

    chat msgs = do
    let chatOpts = myCopt (NE.fromList msgs)
    chatRes' <- O.chat chatOpts

    case chatRes' of
    Left e -> throwString e
    Right r' -> pure r'

    excludeReplPromptMsg msgs = filter (\m -> m.content /= st1.rsReplPromptMsg.content) msgs
    excludeSystemMsgs msgs = filter (\m -> m.role /= O.System) msgs





    mkCopt :: Text -> NE.NonEmpty O.Message -> O.ChatOps
    mkCopt modelName msg =
    -- Chat options, you could use O.defaultChatOptions
    O.ChatOps
    { chatModelName = modelName
    , messages = msg
    , tools = Nothing
    , format = Nothing
    , keepAlive = Nothing
    , hostUrl = Nothing
    , responseTimeOut = Nothing
    , options = Nothing
    , stream = Nothing
    }
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------




    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    -- Summarise
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    summariseFunctions :: Text -> IO Text
    summariseFunctions txt = do
    let prompt = Txt.unlines
    [ "You are a source code summarization expert."
    , "List all the top level functions defined in this code and a short summary of each, no premble, no welcome, no other text, just the name and summary."
    , "Following that add a short summary of the file"
    , "Do not add any additional information or context"
    ]

    let opts =
    O.GenerateOps
    { O.modelName = summariseInputModel
    , O.prompt = txt
    , O.suffix = Nothing
    , O.images = Nothing
    , O.format = Nothing
    , O.system = Just prompt
    , O.template = Nothing
    , O.stream = Nothing
    , O.raw = Nothing
    , O.keepAlive = Nothing
    , O.hostUrl = Nothing
    , O.responseTimeOut = Nothing
    , O.options = Nothing
    }

    O.generate opts >>= \case
    Left e -> throwString e
    Right res -> pure res.response_



    summariseQuery :: Text -> IO Text
    summariseQuery txt = do
    let prompt = Txt.unlines
    [ "You are a source code summarization expert."
    , "The user will provide a question about the code."
    , "Convert the user question into a concise search query that can be used to find relevant text passages. Remove question wording and rephrase as a statement."
    , "Do not add any additional information or context."
    , "Never return anything other than the summary text."
    , "Dont guess function names, just summarise the query"
    ]

    let opts =
    O.GenerateOps
    { O.modelName = summariseQueryModel
    , O.prompt = txt
    , O.suffix = Nothing
    , O.images = Nothing
    , O.format = Nothing
    , O.system = Just prompt
    , O.template = Nothing
    , O.stream = Nothing
    , O.raw = Nothing
    , O.keepAlive = Nothing
    , O.hostUrl = Nothing
    , O.responseTimeOut = Nothing
    , O.options = Nothing
    }

    O.generate opts >>= \case
    Left e -> throwString e
    Right res -> pure res.response_
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------





    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    -- Embedding
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    embedd :: Text -> IO O.EmbeddingResp
    embedd txt = do
    O.embedding embedModel txt >>= \case
    Left e -> throwString e
    Right r' -> pure r'


    reindex :: (Text -> IO Text) -> Pg.Connection -> Text -> Text -> IO ()
    reindex preprocessText conn docId docText = do
    -- Remove all old entries
    _ <- Pg.execute conn "delete from paragraph where document_id = ?" (Pg.Only docId)

    putText $ " Embedding: " <> docId

    summaryText <- preprocessText docText

    -- putText ""
    -- putText ""
    -- putText docId
    -- putText "-----------------------------------------"
    -- putText summaryText
    -- putText "-----------------------------------------"
    -- putText ""
    -- putText ""
    -- putText ""

    embeddedParagraph' <- embedd summaryText
    let embeddedParagraph = embeddedParagraph'.embedding_

    -- Get embedding as Pgvector so it can be inserted
    let vs = Pgv.Vector <$> embeddedParagraph

    -- There might be multiple embeddings
    for_ (zip vs [1::Int ..]) $ \(vec, eidx) -> do
    -- Combined index
    let idx :: Text = show eidx

    Pg.execute conn
    "INSERT INTO paragraph \
    \(document_id, paragraph_index, content, embedding, source) \
    \VALUES (?, ?, ?, ?, ?)"
    (docId, idx, summaryText, vec, docId)


    indexIfNotExists :: (Text -> IO Text) -> Pg.Connection -> Text -> Text -> IO ()
    indexIfNotExists preprocessText conn docId docText = do
    res :: [Pg.Only Text] <- Pg.query conn "SELECT document_id FROM paragraph WHERE document_id = ? limit 1" (Pg.Only docId)
    case res of
    [] -> reindex preprocessText conn docId docText
    _ -> putText $ " Document " <> docId <> " already indexed."
    search :: Pg.Connection -> Text -> Int -> IO ([(Text, Text, Double)], [(Text, Text, Double)])
    search conn txt limit = do
    eres <- O.embedding embedModel txt >>= \case
    Left e -> throwString e
    Right r' -> pure r'
    let emb = take 1 $ eres.embedding_ <&> \v -> Pgv.Vector v
    rs <- for emb $ \vec -> do
    vecRes :: [(Text, Text, Double)] <- Pg.query conn
    "SELECT source, content, \
    \ 1 - (embedding <#> ?) AS similarity \
    \ FROM paragraph \
    \ ORDER BY embedding <#> ? \
    \ LIMIT ?"
    (vec, vec, limit)
    ftsRes :: [(Text, Text, Double)] <- Pg.query conn
    "SELECT source, content, \
    \ ts_rank_cd(content_tsv, plainto_tsquery('english', ?)) AS rank \
    \ FROM paragraph \
    \ WHERE content_tsv @@ plainto_tsquery('english', ?) \
    \ ORDER BY rank DESC \
    \ LIMIT ?"
    (txt, txt, limit)
    pure (vecRes, ftsRes)
    pure (concat $ fst <$> rs, concat $ snd <$> rs)
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    -- File system utils
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    getFilesRec :: FilePath -> IO [FilePath]
    getFilesRec = getFilesRec' (const True)
    getFilesRec' :: (FilePath -> Bool) -> FilePath -> IO [FilePath]
    getFilesRec' continueFn p =
    if continueFn p
    then do
    fs <- getFiles p
    ds <- getDirs p
    cs <- traverse (getFilesRec' continueFn) ds
    pure $ fs <> join cs
    else
    pure []
    getDirs :: FilePath -> IO [FilePath]
    getDirs p = do
    entries <- (p </>) <<$>> Dir.listDirectory p
    filterM Dir.doesDirectoryExist entries
    getFiles :: FilePath -> IO [FilePath]
    getFiles p = do
    entries <- (p </>) <<$>> Dir.listDirectory p
    filterM Dir.doesFileExist entries
    --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    40 changes: 40 additions & 0 deletions setup.sql
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,40 @@
    -- NB as postgres user

    create database demo;
    CREATE USER demo;
    GRANT ALL PRIVILEGES ON DATABASE demo TO demo;
    \connect demo;
    ALTER SCHEMA public OWNER TO demo;
    SET search_path = public;


    CREATE EXTENSION IF NOT EXISTS vector;
    CREATE EXTENSION IF NOT EXISTS pg_trgm;
    CREATE EXTENSION IF NOT EXISTS "uuid-ossp";



    -- NB change to demo user


    CREATE TABLE paragraph (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id TEXT NOT NULL,
    paragraph_index TEXT NOT NULL,
    content TEXT NOT NULL,
    embedding VECTOR(768) NOT NULL, -- 1024 for bge-m3:latest, 768 for nomic-embed-text:latest
    source TEXT,
    inserted_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    content_tsv tsvector GENERATED ALWAYS AS (to_tsvector('english', content)) STORED
    );

    -- Indexes
    CREATE INDEX ON paragraph
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

    CREATE INDEX ON paragraph
    USING GIN (content_tsv);

    CREATE INDEX ON paragraph
    USING GIN (content gin_trgm_ops); -- Optional, for fuzzy searches