export search data, add indexing script

This commit is contained in:
Mirek Kratochvil 2023-06-17 20:40:11 +02:00
parent 0f97b7a64f
commit 32a49d3179
4 changed files with 76 additions and 0 deletions

View file

@ -21,3 +21,7 @@ htmlWriteOpts =
, writerHighlightStyle = Just pygments
, writerWrapText = WrapPreserve
}
-- | Default plaintext writing options for Pandoc. 'WrapNone' disables
-- line re-wrapping so the rendered text comes out as unwrapped
-- paragraphs (this output feeds the search-data extraction, where
-- artificial line breaks would only add noise).
plainWriteOpts :: WriterOptions
plainWriteOpts = def {writerWrapText = WrapNone}

View file

@ -44,6 +44,7 @@ data SiteState =
-- the template directory)
, _templates :: M.Map FilePath Mu.Template
, _outputDir :: FilePath -- ^ Directory for output
, _searchDataOut :: Maybe FilePath -- ^ File to write the searchable versions of pages to (as JSON)
, _assetDir :: FilePath -- ^ Directory for output
, _sourceDirs :: [FilePath] -- ^ Path to page source data
, _templateDir :: FilePath -- ^ Path to template directory
@ -68,6 +69,12 @@ siteOptions' = do
long "output" <>
short 'd' <>
help "Directory to render the site to" <> value "_site" <> showDefault
_searchDataOut <-
Just <$>
(strOption $
long "search-data-output" <>
help "Output JSON with searchable page data to this file") <|>
pure Nothing
_assetDir <-
strOption $
long "assets" <>

39
make-search-index.js Normal file
View file

@ -0,0 +1,39 @@
/*
 * make-search-index.js
 *
 * This converts a "search data" file produced by the haskell site builder into
 * a lunr.js index and saves it in JSON. Metadata for search (currently titles)
 * are stored separately in an extra file.
 *
 * Installing dependencies:
 *   yarnpkg add lunr
 *
 * Usage:
 *   site ....some args.... --search-data-output search-raw.json
 *   node make-search-index.js search-raw.json search-index.json search-meta.json
 */
const lunr = require("lunr")
const fs = require("fs")

// argv = [node, script, input, index output, metadata output]
if (process.argv.length !== 5) {
  console.error('Needs exactly 3 arguments (input json, output index, output metadata).');
  process.exit(1);
}

const documents = JSON.parse(fs.readFileSync(process.argv[2], {encoding: 'utf8'}))

// Build the index: documents are keyed by their link; title matches are
// weighted well above body-text matches.
const idx = lunr(function () {
  this.ref('link')
  this.field('title', {boost: 10})
  this.field('text')
  documents.forEach(function (doc) {
    this.add(doc)
  }, this)
})

fs.writeFileSync(process.argv[3], JSON.stringify(idx), {encoding: 'utf8'})
// The metadata file maps link -> title, for displaying search results.
fs.writeFileSync(process.argv[4], JSON.stringify(
  Object.fromEntries(documents.map(x => [x.link, x.title]))
), {encoding: 'utf8'})

26
site.hs
View file

@ -6,6 +6,7 @@ module Main where
import Control.Monad ((>=>), unless, when)
import Control.Monad.Extra (whenM)
import Control.Monad.Trans.State.Lazy
import qualified Data.Aeson as AE
import qualified Data.Aeson.KeyMap as KM
import qualified Data.ByteString.Lazy as B
import Data.Digest.Pure.SHA (sha256, showDigest)
@ -35,6 +36,7 @@ import System.FilePath
import qualified Text.Mustache as Mu
import Text.Pandoc.Class (runIOorExplode)
import Text.Pandoc.Readers.Markdown (readMarkdown)
import Text.Pandoc.Writers (writePlain)
import Text.Pandoc.Writers.HTML (writeHtml5String)
import qualified Text.Parsec.Error
@ -343,6 +345,29 @@ renderTag tag = do
renderTags :: Site ()
renderTags = use (htags . to M.keys) >>= traverse_ renderTag
-- | Transform one mounted PageInfo to the base search data: a JSON
-- object with @link@, @title@ and @text@ keys, in the shape consumed by
-- @make-search-index.js@.
mkSearchData :: FilePath -> PageInfo -> Site Y.Value
mkSearchData mount info = do
  -- Render the page document to plain, unwrapped text for full-text search.
  text <- io . runIOorExplode $ writePlain plainWriteOpts (info ^. pageDoc)
  let title = info ^? pageMeta . key "title" . _String
  link <- rootUrl mount
  pure $
    Y.object
      [ ("link", fromString link)
      -- Untitled pages fall back to their mount path as the title.
      , ("title", maybe (fromString mount) Y.String title)
      , ("text", Y.String text)
      ]
-- | Collect the search data of all pages and write it out as JSON, if a
-- search-data output file was configured.
renderSearchData :: Site ()
renderSearchData = use searchDataOut >>= traverse_ writeOut
  where
    -- Gather one search-data entry per mounted page and encode the
    -- whole list to the given file.
    writeOut path = do
      entries <- traverse (uncurry mkSearchData) . M.assocs =<< use pages
      io $ do
        putStrLn $ "S -> " ++ path
        AE.encodeFile path $ Y.array entries
-- | Build the whole site.
main = do
init <- Options.Applicative.execParser siteOptions
@ -353,5 +378,6 @@ main = do
use templateDir >>= sourceTemplates
use pages >>= traverse (uncurry installPage) . M.assocs
renderTags
renderSearchData
io $ putStrLn "OK"
whenM (use dumpFinalState) $ get >>= io . print