clean up, support external tokenizers

This commit is contained in:
Mirek Kratochvil 2025-07-14 10:33:22 +02:00
parent 396e5cff54
commit 79977cdf4b
4 changed files with 95 additions and 42 deletions

89
Main.hs
View file

@ -5,27 +5,36 @@ module Main where
import Control.Exception import Control.Exception
import Control.Monad import Control.Monad
import Data.Bool import Data.Bool
import Data.Char
import Data.Foldable import Data.Foldable
import Data.List import Data.List
import Data.Maybe
import Data.Traversable import Data.Traversable
import Opts import Opts
import System.Environment
import System.Exit import System.Exit
import System.FilePath import System.FilePath
import System.IO import System.IO
import System.IO.Temp import System.IO.Temp
import System.Process import System.Process
import qualified Toks import qualified Toks
import Debug.Trace {-
- interface to other programs
-}
diffProg = fromMaybe "diff" <$> lookupEnv "WERGE_DIFF"
gitProg = fromMaybe "git" <$> lookupEnv "WERGE_GIT"
bracketFile path mode = bracket (openFile path mode) hClose
-- TODO: the diff w
rundiff f1 f2 out = do rundiff f1 f2 out = do
diff <- diffProg
st <- st <-
withFile out WriteMode $ \oh -> bracketFile out WriteMode $ \oh ->
withCreateProcess withCreateProcess
(proc (proc
"diff" -- TODO: from WERGE_DIFF env diff
[ "--text" [ "--text"
, "--new-line-format=+%L" , "--new-line-format=+%L"
, "--old-line-format=-%L" , "--old-line-format=-%L"
@ -33,27 +42,28 @@ rundiff f1 f2 out = do
, f1 , f1
, f2 , f2
]) ])
{std_in = NoStream, std_out = UseHandle oh, std_err = Inherit} $ \_ _ _ -> {std_in = NoStream, std_out = UseHandle oh} $ \_ _ _ -> waitForProcess
waitForProcess
when (st == ExitFailure 2) $ error "diff failed" when (st == ExitFailure 2) $ error "diff failed"
unless (st `elem` [ExitSuccess, ExitFailure 1]) unless (st `elem` [ExitSuccess, ExitFailure 1])
$ error "diff failed for unknown reason (is GNU diffutils installed?)" $ error "diff failed for unknown reason (is GNU diffutils installed?)"
gitRepoRelRoot = do gitRepoRelRoot = do
git <- gitProg
(path, st) <- (path, st) <-
withCreateProcess withCreateProcess
(proc "git" ["rev-parse", "--show-cdup"]) (proc git ["rev-parse", "--show-cdup"])
{std_in = NoStream, std_out = CreatePipe, std_err = Inherit} $ \_ (Just oh) _ p -> {std_in = NoStream, std_out = CreatePipe} $ \_ (Just oh) _ p ->
(,) <$> hGetContents' oh <*> waitForProcess p (,) <$> hGetContents' oh <*> waitForProcess p
unless (st == ExitSuccess) $ error "git failed" unless (st == ExitSuccess) $ error "git failed"
let [p] = lines path let [p] = lines path
pure p pure p
gitUnmerged = do gitUnmerged = do
git <- gitProg
(paths, st) <- (paths, st) <-
withCreateProcess withCreateProcess
(proc "git" ["status", "--porcelain=v1"]) (proc git ["status", "--porcelain=v1"])
{std_in = NoStream, std_out = CreatePipe, std_err = Inherit} $ \_ (Just oh) _ p -> {std_in = NoStream, std_out = CreatePipe} $ \_ (Just oh) _ p ->
(,) (,)
<$> (map (drop 3) . filter ("UU " `isPrefixOf`) . lines <$> (map (drop 3) . filter ("UU " `isPrefixOf`) . lines
<$> hGetContents' oh) <$> hGetContents' oh)
@ -61,11 +71,12 @@ gitUnmerged = do
unless (st == ExitSuccess) $ error "git failed" unless (st == ExitSuccess) $ error "git failed"
pure paths pure paths
gitCheckoutMOY u my old your = do gitCheckoutMOY cfg u my old your = do
git <- gitProg
(paths, st) <- (paths, st) <-
withCreateProcess withCreateProcess
(proc "git" ["ls-files", "--unmerged", "--", u]) (proc git ["ls-files", "--unmerged", "--", u])
{std_in = NoStream, std_out = CreatePipe, std_err = Inherit} $ \_ (Just oh) _ p -> {std_in = NoStream, std_out = CreatePipe} $ \_ (Just oh) _ p ->
(,) (,)
<$> (sortOn snd <$> (sortOn snd
. map ((\[a, b] -> (a, b)) . take 2 . drop 1 . words) . map ((\[a, b] -> (a, b)) . take 2 . drop 1 . words)
@ -77,8 +88,8 @@ gitCheckoutMOY u my old your = do
st <- st <-
withCreateProcess withCreateProcess
(proc "git" ["cat-file", "blob", hash]) (proc "git" ["cat-file", "blob", hash])
{std_in = NoStream, std_out = CreatePipe, std_err = Inherit} $ \_ (Just ho) _ p -> do {std_in = NoStream, std_out = CreatePipe} $ \_ (Just ho) _ p -> do
hGetContents ho >>= writeFile path . Toks.split -- TODO cfg hSplitToFile cfg ho path
waitForProcess p waitForProcess p
unless (st == ExitSuccess) . error unless (st == ExitSuccess) . error
$ "failed checking out " ++ u ++ " from blob " ++ hash $ "failed checking out " ++ u ++ " from blob " ++ hash
@ -88,10 +99,30 @@ gitCheckoutMOY u my old your = do
_ -> error $ "bad data from ls-files for unmerged " ++ u _ -> error $ "bad data from ls-files for unmerged " ++ u
gitAdd path = do gitAdd path = do
traceM $ "adding " ++ path git <- gitProg
st <- rawSystem "git" ["add", "--", path] st <- rawSystem git ["add", "--", path]
unless (st == ExitSuccess) $ error "git-add failed" unless (st == ExitSuccess) $ error "git-add failed"
{-
- configurable splitting
-}
hSplitToFile cfg h path =
case cfgTokenizer cfg of
TokenizeCharCategory -> internal Toks.splitCategory
TokenizeCharCategorySimple -> internal Toks.splitSimple
TokenizeFilter cmd -> do
st <-
bracketFile path WriteMode $ \ho ->
withCreateProcess
(shell cmd) {std_in = UseHandle h, std_out = UseHandle ho} $ \_ _ _ ->
waitForProcess
unless (st == ExitSuccess) $ error "tokenize filter failed"
where
internal s = hGetContents h >>= writeFile path . Toks.toFile . s
{-
- merge algorithms
-}
data Op data Op
= Del = Del
| Keep | Keep
@ -100,10 +131,10 @@ data Op
pdiff path = map go . lines <$> readFile path pdiff path = map go . lines <$> readFile path
where where
go [] = error "empty line from diff"
go ('-':s) = (Del, s) go ('-':s) = (Del, s)
go (' ':s) = (Keep, s) go (' ':s) = (Keep, s)
go ('+':s) = (Add, s) go ('+':s) = (Add, s)
go [] = error "unexpected output from diff"
data Merged data Merged
= Ok [String] = Ok [String]
@ -204,29 +235,31 @@ resolveSpaces _ x = x
merge cfg@Config {..} ms ys = merge cfg@Config {..} ms ys =
regroup regroup
. map (resolve cfg) . map (resolve cfg)
. traceShowId
. regroup . regroup
. bool id (concatMap zeal) cfgZealous . bool id (concatMap zeal) cfgZealous
. expand cfgContext . expand cfgContext
. regroup . regroup
$ align ms ys $ align ms ys
{-
- front-end
-}
format :: Config -> Handle -> [Merged] -> IO Bool format :: Config -> Handle -> [Merged] -> IO Bool
format Config {..} h = go False format Config {..} h = go False
where where
go c [] = pure c go c [] = pure c
go c (Ok x:xs) = do go c (Ok x:xs) = do
hPutStr h (Toks.glueToks x) hPutStr h (Toks.glue x)
go c xs go c xs
go c (Conflict m o y:xs) = do go c (Conflict m o y:xs) = do
hPutStr h hPutStr h
$ mconcat $ mconcat
[ cfgLabelStart [ cfgLabelStart
, Toks.glueToks m , Toks.glue m
, cfgLabelMyOld , cfgLabelMyOld
, Toks.glueToks o , Toks.glue o
, cfgLabelOldYour , cfgLabelOldYour
, Toks.glueToks y , Toks.glue y
, cfgLabelEnd , cfgLabelEnd
] ]
go True xs go True xs
@ -236,7 +269,7 @@ runCmd CmdDiff3 {..} cfg =
let [fMy, fOld, fYour, fdMy, fdYour] = let [fMy, fOld, fYour, fdMy, fdYour] =
map (workdir </>) ["my", "old", "your", "mydiff", "yourdiff"] map (workdir </>) ["my", "old", "your", "mydiff", "yourdiff"]
for_ [(d3my, fMy), (d3old, fOld), (d3your, fYour)] $ \(path, tmp) -> for_ [(d3my, fMy), (d3old, fOld), (d3your, fYour)] $ \(path, tmp) ->
readFile path >>= writeFile tmp . Toks.split -- TODO cfg bracketFile path ReadMode $ \h -> hSplitToFile cfg h tmp
rundiff fOld fMy fdMy rundiff fOld fMy fdMy
rundiff fOld fYour fdYour rundiff fOld fYour fdYour
conflicted <- conflicted <-
@ -255,15 +288,13 @@ runCmd CmdGitMerge {..} cfg = do
withSystemTempDirectory "werge-git" $ \workdir -> do withSystemTempDirectory "werge-git" $ \workdir -> do
let [fMy, fOld, fYour, fdMy, fdYour] = let [fMy, fOld, fYour, fdMy, fdYour] =
map (workdir </>) ["my", "old", "your", "mydiff", "yourdiff"] map (workdir </>) ["my", "old", "your", "mydiff", "yourdiff"]
gitCheckoutMOY u fMy fOld fYour gitCheckoutMOY cfg u fMy fOld fYour
rundiff fOld fMy fdMy rundiff fOld fMy fdMy
rundiff fOld fYour fdYour rundiff fOld fYour fdYour
readFile u >>= writeFile (u ++ ".werge-backup") readFile u >>= writeFile (u ++ ".werge-backup")
conflict <- conflict <-
withFile u WriteMode $ \h -> bracketFile u WriteMode $ \h ->
merge cfg <$> pdiff fdMy <*> pdiff fdYour >>= format cfg h merge cfg <$> pdiff fdMy <*> pdiff fdYour >>= format cfg h
traceShowM conflict
traceShowM gmDoAdd
unless conflict $ when gmDoAdd $ gitAdd u unless conflict $ when gmDoAdd $ gitAdd u
pure conflict pure conflict
if or conflicts if or conflicts

14
Opts.hs
View file

@ -9,32 +9,32 @@ import Options.Applicative
import Paths_werge (version) import Paths_werge (version)
data Tokenizer data Tokenizer
= TokenizerFilter String = TokenizeFilter String
| TokenizeCharClass | TokenizeCharCategory
| TokenizeCharClassSimple | TokenizeCharCategorySimple
deriving (Show) deriving (Show)
tokenizer = tokenizer =
asum asum
[ TokenizerFilter [ TokenizeFilter
<$> strOption <$> strOption
(long "tok-filter" (long "tok-filter"
<> short 'F' <> short 'F'
<> metavar "FILTER" <> metavar "FILTER"
<> help "external program to separate the text to tokens") <> help "external program to separate the text to tokens")
, flag' , flag'
TokenizeCharClassSimple TokenizeCharCategorySimple
(long "simple-tokens" (long "simple-tokens"
<> short 'i' <> short 'i'
<> help <> help
"use wider character class to separate the tokens (results in larger tokens and ignores case)") "use wider character class to separate the tokens (results in larger tokens and ignores case)")
, flag' , flag'
TokenizeCharClass TokenizeCharCategory
(long "full-tokens" (long "full-tokens"
<> short 'I' <> short 'I'
<> help <> help
"separate characters by all known character classes (default)") "separate characters by all known character classes (default)")
, pure TokenizeCharClass , pure TokenizeCharCategory
] ]
data Spaces data Spaces

View file

@ -22,6 +22,11 @@ Better docs is WIP
cabal install cabal install
``` ```
Running of `werge` requires a working installation of `diff` compatible
with the one from [GNU diffutils](https://www.gnu.org/software/diffutils/). You
may set up a path to such `diff` (or a wrapper script) via environment variable
`WERGE_DIFF`.
## Help & features ## Help & features
``` ```

29
Toks.hs
View file

@ -26,11 +26,28 @@ unmarkSpace x = error "unwat"
space ('.':_) = True space ('.':_) = True
space _ = False space _ = False
split = joinSpaces [] = []
unlines joinSpaces (a@('.':as):xs) =
. map (concatMap escape . markSpace) case joinSpaces xs of
. groupBy ((==) `on` generalCategory) (('.':bs):xs') -> ('.' : (as ++ bs)) : xs'
xs' -> a : xs'
joinSpaces (x:xs) = x : joinSpaces xs
glueToks = concatMap (unmarkSpace . unescape) splitCategory = make . groupBy ((==) `on` generalCategory)
glue = glueToks . lines simpleCategory c
| isSpace c = 0
| isAlpha c = 1
| isNumber c = 2
| otherwise = 3
splitSimple = make . groupBy ((==) `on` simpleCategory)
make = map (concatMap escape . markSpace)
glue :: [String] -> String
glue = concatMap (unmarkSpace . unescape)
fromFile = lines
toFile = unlines