export search data, add indexing script

This commit is contained in:
Mirek Kratochvil 2023-06-17 20:40:11 +02:00
parent 0f97b7a64f
commit 32a49d3179
4 changed files with 76 additions and 0 deletions

View file

@ -21,3 +21,7 @@ htmlWriteOpts =
, writerHighlightStyle = Just pygments
, writerWrapText = WrapPreserve
}
-- | Default plaintext writing options for Pandoc. 'WrapNone' disables
-- line re-wrapping so the rendered text comes out as unwrapped
-- paragraphs (this output feeds the search-data extraction, where
-- artificial line breaks would only add noise).
plainWriteOpts :: WriterOptions
plainWriteOpts = def {writerWrapText = WrapNone}

View file

@ -44,6 +44,7 @@ data SiteState =
-- the template directory)
, _templates :: M.Map FilePath Mu.Template
, _outputDir :: FilePath -- ^ Directory for output
, _searchDataOut :: Maybe FilePath -- ^ File to write the searchable versions of pages to (as JSON)
, _assetDir :: FilePath -- ^ Directory for output
, _sourceDirs :: [FilePath] -- ^ Path to page source data
, _templateDir :: FilePath -- ^ Path to template directory
@ -68,6 +69,12 @@ siteOptions' = do
long "output" <>
short 'd' <>
help "Directory to render the site to" <> value "_site" <> showDefault
_searchDataOut <-
Just <$>
(strOption $
long "search-data-output" <>
help "Output JSON with searchable page data to this file") <|>
pure Nothing
_assetDir <-
strOption $
long "assets" <>

39
make-search-index.js Normal file
View file

@ -0,0 +1,39 @@
/*
 * make-search-index.js
 *
 * This converts a "search data" file produced by the haskell site builder into
 * a lunr.js index and saves it in JSON. Metadata for search (currently titles)
 * are stored separately in an extra file.
 *
 * Installing dependencies:
 *   yarnpkg add lunr
 *
 * Usage:
 *   site ....some args.... --search-data-output search-raw.json
 *   node make-search-index.js search-raw.json search-index.json search-meta.json
 */
const lunr = require("lunr")
const fs = require("fs")

// argv = [node, script, input, index output, metadata output]
if (process.argv.length !== 5) {
  console.error('Needs exactly 3 arguments (input json, output index, output metadata).');
  process.exit(1);
}

const documents = JSON.parse(fs.readFileSync(process.argv[2], {encoding: 'utf8'}))

// Build the index: documents are keyed by their link; title matches are
// weighted well above body-text matches.
const idx = lunr(function () {
  this.ref('link')
  this.field('title', {boost: 10})
  this.field('text')
  documents.forEach(function (doc) {
    this.add(doc)
  }, this)
})

fs.writeFileSync(process.argv[3], JSON.stringify(idx), {encoding: 'utf8'})
// The metadata file maps link -> title, for displaying search results.
fs.writeFileSync(process.argv[4], JSON.stringify(
  Object.fromEntries(documents.map(x => [x.link, x.title]))
), {encoding: 'utf8'})

26
site.hs
View file

@ -6,6 +6,7 @@ module Main where
import Control.Monad ((>=>), unless, when)
import Control.Monad.Extra (whenM)
import Control.Monad.Trans.State.Lazy
import qualified Data.Aeson as AE
import qualified Data.Aeson.KeyMap as KM
import qualified Data.ByteString.Lazy as B
import Data.Digest.Pure.SHA (sha256, showDigest)
@ -35,6 +36,7 @@ import System.FilePath
import qualified Text.Mustache as Mu
import Text.Pandoc.Class (runIOorExplode)
import Text.Pandoc.Readers.Markdown (readMarkdown)
import Text.Pandoc.Writers (writePlain)
import Text.Pandoc.Writers.HTML (writeHtml5String)
import qualified Text.Parsec.Error
@ -343,6 +345,29 @@ renderTag tag = do
renderTags :: Site ()
renderTags = use (htags . to M.keys) >>= traverse_ renderTag
-- | Transform one mounted PageInfo to the base search data: a JSON
-- object with @link@, @title@ and @text@ keys, in the shape consumed by
-- @make-search-index.js@.
mkSearchData :: FilePath -> PageInfo -> Site Y.Value
mkSearchData mount info = do
  -- Render the page document to plain, unwrapped text for full-text search.
  text <- io . runIOorExplode $ writePlain plainWriteOpts (info ^. pageDoc)
  let title = info ^? pageMeta . key "title" . _String
  link <- rootUrl mount
  pure $
    Y.object
      [ ("link", fromString link)
      -- Untitled pages fall back to their mount path as the title.
      , ("title", maybe (fromString mount) Y.String title)
      , ("text", Y.String text)
      ]
-- | Collect the search data of all pages and write it out as JSON, if a
-- search-data output file was configured.
renderSearchData :: Site ()
renderSearchData = use searchDataOut >>= traverse_ writeOut
  where
    -- Gather one search-data entry per mounted page and encode the
    -- whole list to the given file.
    writeOut path = do
      entries <- traverse (uncurry mkSearchData) . M.assocs =<< use pages
      io $ do
        putStrLn $ "S -> " ++ path
        AE.encodeFile path $ Y.array entries
-- | Build the whole site.
main = do
init <- Options.Applicative.execParser siteOptions
@ -353,5 +378,6 @@ main = do
use templateDir >>= sourceTemplates
use pages >>= traverse (uncurry installPage) . M.assocs
renderTags
renderSearchData
io $ putStrLn "OK"
whenM (use dumpFinalState) $ get >>= io . print