From 6e2ab88961381ab0e53fccb1be13df3eacf0446d Mon Sep 17 00:00:00 2001
From: Mirek Kratochvil
Date: Wed, 25 May 2022 08:30:09 +0200
Subject: [PATCH] a bit of a revamp

---
 README.md         |  26 +++---
 adiff.cabal       |  11 +--
 lexers/letters    |   2 -
 lexers/lines      |   2 -
 lexers/nums       |   2 -
 lexers/words      |   2 -
 src/Diff.hs       |  28 ++++---
 src/Format.hs     |  99 ++++++++++++++++++++---
 src/Main.hs       | 200 +++++++++++++++++++++++++++++++++++-----------
 src/Merge.hs      |   8 +-
 src/Patch.hs      |  32 +++++++-
 src/Redfa.hs      | 177 ----------------------------------------
 src/Substr.hs     |   7 --
 src/Tokenizers.hs | 114 ++++++++++++++++++++++++++
 src/Types.hs      |   2 +
 15 files changed, 423 insertions(+), 289 deletions(-)
 delete mode 100644 lexers/letters
 delete mode 100644 lexers/lines
 delete mode 100644 lexers/nums
 delete mode 100644 lexers/words
 delete mode 100644 src/Redfa.hs
 delete mode 100644 src/Substr.hs
 create mode 100644 src/Tokenizers.hs

diff --git a/README.md b/README.md
index 16ee2ae..bc95287 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,14 @@ standard unix `diff` and related tools.
 The problem was previously tackled by Arek Antoniewicz on MFF CUNI, who
 produced a working software package in C++, and designed the Regex-edged DFAs
 (REDFAs) that were used for user-specifiable tokenization of the input. The
-work on the corresponding thesis is currently ongoing.
+work on the corresponding thesis is finished.
 
 This started as a simple Haskell port of that work, and packed some relatively
-orthogonal improvements (mainly the histogram-style diffing).
+orthogonal improvements (mainly the histogram-style diffing). I later got rid
+of the REDFA concept -- while super-interesting and useful in theory, I didn't
+find a sufficiently universal way to build good lexers from user-specified
+strings. Having a proper Regex representation library (so that e.g.
+reconstructing Flex is easy) would help a lot.
 
 ### TODO list
 
@@ -23,16 +27,12 @@ orthogonal improvements (mainly the histogram-style diffing).
   of hunk context. `diff` and `diff3` works.
 - Implement a splitting heuristic for diffs, so that diffing of large files
   doesn't take aeons
-- Check whether REDFA can even be implemented correctly with current Haskell
-  libraries (most regex libraries target a completely different). Taking the
-  lexer specification format from `alex` currently seems like a much better
-  option. Deferring the task unix-ishly to another program could work too.
+- Check if we can have external lexers, unix-style
 
 # How-To
 
 Install using `cabal`. The `adiff` program has 3 sub-commands that work like
-`diff`, `patch` and `diff3`. It expects a lexing specification on the input;
-there are several very simple example lexers in `lexers/`.
+`diff`, `patch` and `diff3`.
 
 ## Example
 
@@ -49,15 +49,11 @@ Patching is hard.
 I still cannot rhyme.
 ```
 
 Let's use the `words` lexer, which marks everything whitespace-ish as
-whitespace, and picks up groups of non-whitespace "content" characters:
-```
-:[^ \t\n]*
-_:[ \t\n]*
-```
+whitespace, and picks up groups of non-whitespace "content" characters.
 
 Diffing the 2 files gets done as such:
 ```
- $ cabal run adiff -- -l lexers/words diff orig mine
+ $ cabal run adiff -- -l words diff orig mine
 ```
 
 You should get something like this:
 ```
@@ -99,7 +95,7 @@ I cannot do verses.
We can run `diff3` to get a patch with both changes, optionally with reduced context: ``` - $ cabal run adiff -- -l lexers/words diff3 mine orig yours -C1 + $ cabal run adiff -- -l words diff3 mine orig yours -C1 ``` ...which outputs: ``` diff --git a/adiff.cabal b/adiff.cabal index af860c9..7bbf9ca 100644 --- a/adiff.cabal +++ b/adiff.cabal @@ -51,8 +51,7 @@ executable adiff Hunks, Merge, Patch, - Redfa, - Substr, + Tokenizers, Types, Version @@ -60,13 +59,15 @@ executable adiff other-extensions: CPP -- Other library packages from which modules are imported. - build-depends: base ^>=4.13.0.0, + build-depends: base ^>=4.15.0.0, + attoparsec ^>=0.14, extra ^>= 1.7, mmap ^>=0.5, - regex-tdfa ^>= 1.3, optparse-applicative ^>=0.16, - bytestring ^>= 0.10.12, + bytestring ^>= 0.11.2, vector ^>=0.12, + word8 ^>=0.1, + unicode-data ^>=0.3, utf8-string ^>=1.0 -- Directories containing source files. diff --git a/lexers/letters b/lexers/letters deleted file mode 100644 index c8ca875..0000000 --- a/lexers/letters +++ /dev/null @@ -1,2 +0,0 @@ -[a-z] -_:[ \n] diff --git a/lexers/lines b/lexers/lines deleted file mode 100644 index 5bf4719..0000000 --- a/lexers/lines +++ /dev/null @@ -1,2 +0,0 @@ -[^\n]*\n -[^\n]* diff --git a/lexers/nums b/lexers/nums deleted file mode 100644 index 3e57b7e..0000000 --- a/lexers/nums +++ /dev/null @@ -1,2 +0,0 @@ -[0-9] -_:\n diff --git a/lexers/words b/lexers/words deleted file mode 100644 index a772682..0000000 --- a/lexers/words +++ /dev/null @@ -1,2 +0,0 @@ -:[^ \t\n]* -_:[ \t\n]* diff --git a/src/Diff.hs b/src/Diff.hs index 244aaf2..85e21fb 100644 --- a/src/Diff.hs +++ b/src/Diff.hs @@ -20,9 +20,9 @@ data DiffEnv = , deB :: Int , deVS :: V.Vector (Int, Int) , deVE :: V.Vector (Int, Int) + , deTokPrio :: Tok -> Int , deTrans :: Bool } - deriving (Show) toksMatch :: Int -> Int -> DiffEnv -> Bool toksMatch x y DiffEnv {deT1 = t1, deT2 = t2} = t1 V.! x == t2 V.! y @@ -47,10 +47,19 @@ stripEqToks t1 t2 = (pre, post, t1', t2') t1' = V.slice b (l1 - e - b) t1 t2' = V.slice b (l2 - e - b) t2 +makePrios :: TV -> TV -> (Bool, BS) -> Int +makePrios _ _ = get + where + get (isToken, str) = + if isToken + then B.length str + else 0 + diffToks :: TV -> TV -> Diff diffToks t1' t2' = pre ++ res ++ post where (pre, post, t1, t2) = stripEqToks t1' t2' + stats = makePrios t1' t2' res | V.null t1 = map (Add, ) (V.toList t2) | V.null t2 = map (Remove, ) (V.toList t1) @@ -67,6 +76,7 @@ diffToks t1' t2' = pre ++ res ++ post , deB = V.length t2 , deVS = V.fromList $ zip [0 .. V.length t2] (repeat 0) , deVE = V.fromList $ reverse $ zip [0 .. V.length t2] (repeat 0) + , deTokPrio = stats , deTrans = False } | otherwise = @@ -82,6 +92,7 @@ diffToks t1' t2' = pre ++ res ++ post , deB = V.length t1 , deVS = V.fromList $ zip [0 .. V.length t1] (repeat 0) , deVE = V.fromList $ reverse $ zip [0 .. V.length t1] (repeat 0) + , deTokPrio = stats , deTrans = True } @@ -110,12 +121,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} = mid = quot (s + e) 2 vecSmid = vecS mid vecEmid = vecE mid - extraScore i = - if isToken - then -(B.length str) - else 0 - where - (isToken, str) = deT1 de V.! i + prio i = negate . deTokPrio de $ deT1 de V.! i vecS = vec -- "forward" operation where vec i @@ -131,7 +137,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} = (iupleft, supleft) = v V.! 
pred j
                 keep
                   | toksMatch (pred i) (pred j) de =
-                    min (iupleft, supleft + extraScore (pred i))
+                    min (iupleft, supleft + prio (pred i))
                   | otherwise = id
                 res = keep $ min (succ iup, sup) (succ ileft, sleft)
              in res : go (succ j) res
@@ -150,7 +156,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
                 (idownright, sdownright) = v V.! succ j
                 keep
                   | toksMatch i j de =
-                    min (idownright, sdownright + extraScore i)
+                    min (idownright, sdownright + prio i)
                   | otherwise = id
                 res = keep $ min (succ idown, sdown) (succ iright, sright)
              in res : go (pred j) res
@@ -213,7 +219,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
             , fst (vecLS V.! i) == fst sCost - a + i
             , sumL V.! i == totalCost
             , if doKeep
-                then scoreAdd (vecLS V.! i) (0, extraScore s) ==
+                then scoreAdd (vecLS V.! i) (0, prio s) ==
                      vecRS V.! succ i
                 else scoreAdd (vecLS V.! i) (1, 0) == vecRS V.! i
             , if doKeep
@@ -223,7 +229,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
         jumpEnd =
           if doKeep
-            then jumpPos + 1
+            then succ jumpPos
             else jumpPos
      in map
           (\i ->
diff --git a/src/Format.hs b/src/Format.hs
index e225464..9ca635f 100644
--- a/src/Format.hs
+++ b/src/Format.hs
@@ -1,30 +1,39 @@
-{-# LANGUAGE OverloadedStrings #-}
-
 module Format
   ( pprHunks
   , pprHunk
-  , pprDiff1
+  , parsePatch
   ) where
 
 import Types
 
+import Control.Applicative
+import Control.Monad
+import Data.Attoparsec.ByteString as A
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Builder as BB
 import qualified Data.ByteString.Internal as BI
+import qualified Data.ByteString.Lazy as BL
 import Data.String
+import Data.Word8 as W8
 
-pprHunkHdr :: Int -> Int -> BB.Builder
-pprHunkHdr i j = fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@"
+backslash :: Word8
+backslash = BI.c2w '\\'
 
-lineSep :: BB.Builder
-lineSep = fromString "\n"
+newline :: Word8
+newline = BI.c2w '\n'
 
 pprHunks :: [Hunk] -> BB.Builder
 pprHunks = mconcat . map pprHunk
 
+lineSep :: BB.Builder
+lineSep = BB.word8 newline
+
+pprHunkHdr :: Int -> Int -> BB.Builder
+pprHunkHdr i j =
+  (fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@") <> lineSep
+
 pprHunk :: Hunk -> BB.Builder
-pprHunk ((i, j), toks) =
-  mconcat ((pprHunkHdr i j <> lineSep) : map pprDiff1 toks)
+pprHunk ((i, j), toks) = mconcat (pprHunkHdr i j : map pprDiff1 toks)
 
 pprDiff1 :: (Op, Tok) -> BB.Builder
 pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
@@ -46,6 +55,74 @@ pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
 escNewlines :: BS -> BB.Builder
 escNewlines s
   | B.null s = mempty
-  | B.head s == BI.c2w '\n' = BB.stringUtf8 "\\n" <> escNewlines (B.tail s)
-  | B.head s == BI.c2w '\\' = BB.stringUtf8 "\\\\" <> escNewlines (B.tail s)
+  | B.head s == newline =
+    BB.word8 backslash <> BB.word8 (BI.c2w 'n') <> escNewlines (B.tail s)
+  | B.head s == backslash =
+    BB.word8 backslash <> BB.word8 backslash <> escNewlines (B.tail s)
   | otherwise = BB.word8 (B.head s) <> escNewlines (B.tail s)
+
+parsePatch :: BS -> Either String [Hunk]
+parsePatch = parseOnly parseHunks
+
+parseHunks :: Parser [Hunk]
+parseHunks = many parseHunk <* endOfInput
+
+parseHunk :: Parser Hunk
+parseHunk = liftA2 (,) parseHunkHdr (many parseDiff1)
+
+parseInt :: Parser Int
+parseInt = read . map BI.w2c <$> some (satisfy W8.isDigit)
+
+eol :: Parser ()
+eol = void $ word8 newline
+
+parseHunkHdr :: Parser (Int, Int)
+parseHunkHdr = do
+  void . string $ fromString "@@ -"
+  i <- parseInt
+  void . string $ fromString " +"
+  j <- parseInt
+  void . string $ fromString " @@"
+  eol
+  return (i, j)
+
+pairs2parsers :: [(a, Char)] -> [Parser a]
+pairs2parsers = map (\(x, ch) -> x <$ word8 (BI.c2w ch))
+
+parseOpList :: [Parser Op]
+parseOpList =
+  pairs2parsers
+    [ (Add, '+')
+    , (Keep, ' ')
+    , (Remove, '-')
+    , (MineChanged, '<')
+    , (Original, '=')
+    , (YourChanged, '>')
+    ]
+
+parseOp :: Parser Op
+parseOp = choice parseOpList
+
+parseTokMarkList :: [Parser Bool]
+parseTokMarkList = pairs2parsers [(True, '|'), (False, '.')]
+
+parseTokMark :: Parser Bool
+parseTokMark = choice parseTokMarkList
+
+parseTokBS :: Parser BS
+parseTokBS =
+  (BL.toStrict . BB.toLazyByteString . mconcat <$> many parseTokChar) <* eol
+
+parseTokChar :: Parser BB.Builder
+parseTokChar =
+  choice
+    [ BB.word8 newline <$ string (fromString "\\n")
+    , BB.word8 backslash <$ string (fromString "\\\\")
+    , BB.word8 <$> satisfy (\w -> w /= backslash && w /= newline)
+    ]
+
+parseTok :: Parser Tok
+parseTok = liftA2 (,) parseTokMark parseTokBS
+
+parseDiff1 :: Parser (Op, Tok)
+parseDiff1 = liftA2 (,) parseOp parseTok
diff --git a/src/Main.hs b/src/Main.hs
index 640ad87..30d3551 100644
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -1,6 +1,10 @@
 module Main where
 
+import Control.Exception
+import Control.Monad
+import qualified Data.ByteString as B
 import qualified Data.ByteString.Builder as BB
+import Data.Foldable (traverse_)
 import qualified Data.Vector as V
 import Diff
 import Diff3
@@ -8,15 +12,17 @@ import Format
 import Hunks
 import Merge
 import Options.Applicative
-import Redfa
-import System.IO (stdout)
+import Patch
+import System.Exit
+import System.IO (hPutStrLn, stderr, stdout)
 import System.IO.MMap
+import Tokenizers
 import Types
 import Version
 
 data ADiffOptions =
   ADiffOptions
-    { adiffRedfaOpt :: RedfaOption
+    { adiffTokOpts :: TokOpts
     , adiffCmdOpts :: ADiffCommandOpts
     }
   deriving (Show)
@@ -29,11 +35,16 @@
       }
   | CmdPatch
       { patchDryRun :: Bool
-      , patchInDir :: Maybe String
-      , patchInput :: String
+      --, patchInDir :: Maybe String
+      --, patchPathStrip :: Int
+      , patchInputPatch :: String
+      , patchOutput :: String
+      , patchReject :: String --TODO convert to Maybes with optional
+      , patchBackup :: String
       , patchReverse :: Bool
-      , patchPathStrip :: Int
+      , context :: Int
       , patchMergeOpts :: MergeOpts
+      , patchInput :: String
       }
   | CmdDiff3
       { context :: Int
@@ -44,15 +55,22 @@
       }
   deriving (Show)
 
-contextOpt :: Parser Int
-contextOpt =
+contextOpt :: Bool -> Parser Int
+contextOpt forPatch =
   check <$>
   option
     auto
     (metavar "CONTEXT" <>
     short 'C' <>
     long "context" <>
-    value 5 <> help "How many tokens around the change to include in the patch")
+    value
+      (if forPatch
+         then 4
+         else 8) <>
+    help
+      (if forPatch
+         then "Minimum number of context tokens that must match so that the hunk is applied"
+         else "How many tokens around the change to include in the patch"))
  where
    check c
      | c < 0 = error "Negative context"
@@ -60,7 +78,7 @@ contextOpt =
 
 diffCmdOptions :: Parser ADiffCommandOpts
 diffCmdOptions =
-  CmdDiff <$> contextOpt <*> strArgument (metavar "FROMFILE") <*>
+  CmdDiff <$> contextOpt False <*> strArgument (metavar "FROMFILE") <*>
   strArgument (metavar "TOFILE")
 
 patchCmdOptions :: Parser ADiffCommandOpts
 patchCmdOptions =
   CmdPatch <$>
   switch
     (short 'n' <>
      long "dry-run" <>
      help "Do not patch anything, just print what would happen") <*>
-  optional
-    (strOption $
-     short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
+  -- optional (strOption $ short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
+  --option auto (short 'p' <> long "strip" <> metavar "NUM" <> help "Strip NUM leading components from the paths" <> value 0) <*>
   strOption
     (short 'i' <>
     long "input" <>
-    metavar "INPUT" <>
-    help "Read the patchfile from INPUT, defaults to `-' for STDIN" <>
+    metavar "PATCHFILE" <>
+    help "Read the patchfile from PATCHFILE, defaults to `-' for STDIN" <>
     value "-") <*>
+  strOption
+    (short 'o' <>
+     long "output" <>
+     metavar "OUTPUT" <>
+     help
+       "Write the patched file to OUTPUT, use `-' for STDOUT. By default, INPUT is rewritten." <>
+     value "") <*>
+  strOption
+    (short 'r' <>
+     long "reject" <>
+     metavar "REJECTS" <>
+     help
+       "Write the rejected hunks to REJECTS instead of the default `OUTPUT.rej'. Use `-' to discard rejects." <>
+     value "") <*>
+  strOption
+    (short 'b' <>
+     long "backup" <>
+     metavar "BACKUP" <>
+     help
+       "When rewriting INPUT after a partially applied or otherwise suspicious patch, back up the original file as BACKUP instead of the default `INPUT.orig'. Use `-' to discard backups." <>
+     value "") <*>
   switch (short 'R' <> long "reverse" <> help "Unapply applied patches") <*>
-  option
-    auto
-    (short 'p' <>
-     long "strip" <>
-     metavar "NUM" <>
-     help "Strip NUM leading components from the paths" <> value 0) <*>
-  mergeOption True
+  contextOpt True <*>
+  mergeOption True <*>
+  strArgument (metavar "INPUT")
 
 diff3CmdOptions :: Parser ADiffCommandOpts
 diff3CmdOptions =
-  CmdDiff3 <$> contextOpt <*> strArgument (metavar "MYFILE") <*>
+  CmdDiff3 <$> contextOpt False <*> strArgument (metavar "MYFILE") <*>
   strArgument (metavar "OLDFILE") <*>
   strArgument (metavar "YOURFILE") <*> mergeOption False
 
-actionOption :: Parser ADiffCommandOpts
-actionOption =
+actionOptions :: Parser ADiffCommandOpts
+actionOptions =
   hsubparser $
   mconcat
     [ command "diff" $ info diffCmdOptions $ progDesc "Compare two files"
@@ -106,36 +140,106 @@ actionOption =
     ]
 
 adiffOptions :: Parser ADiffOptions
-adiffOptions = ADiffOptions <$> redfaOption <*> actionOption
+adiffOptions = ADiffOptions <$> tokOptions <*> actionOptions
 
-loadToks :: RedfaSpec -> FilePath -> IO TV
-loadToks redfa f =
-  V.fromList <$> (mmapFileByteString f Nothing >>= redfaTokenize redfa)
+-- TODO: load in case it's not a regular file
+loadToks :: TokOpts -> FilePath -> IO TV
+loadToks topt f =
+  V.fromList <$> (mmapFileByteString f Nothing >>= tokenize topt f)
 
-main :: IO ()
-main =
+doDiff :: TokOpts -> ADiffCommandOpts -> IO ()
+doDiff topt (CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt}) = do
+  [toks1, toks2] <- traverse (loadToks topt) [f1, f2]
+  let output = hunks (max 0 ctxt) $ diffToks toks1 toks2
+  BB.hPutBuilder stdout $ pprHunks output
+  unless (null output) $ exitWith (ExitFailure 1)
+doDiff _ _ = error "dispatch failure"
+
+doDiff3 :: TokOpts -> ADiffCommandOpts -> IO ()
+doDiff3 topt (CmdDiff3 ctxt f1 f2 f3 mo) = do
+  [toksMine, toksOld, toksYour] <- traverse (loadToks topt) [f1, f2, f3]
+  let d3 = diff3Toks mo toksMine toksOld toksYour
+      isConflict (MineChanged, _) = True
+      isConflict (YourChanged, _) = True
+      isConflict _ = False
+      hasConflict = any isConflict d3
+  BB.hPutBuilder stdout $
+    if mergeDoMerge mo
+      then fmtMerged mo d3
+      else pprHunks $ hunks (max 0 ctxt) d3
+  when hasConflict $ exitWith (ExitFailure 1)
+doDiff3 _ _ = error "dispatch failure"
+
+doPatch :: TokOpts -> ADiffCommandOpts -> IO ()
+doPatch topt o = do
+  toksIn <- loadToks topt (patchInput o)
+  hs' <-
+    parsePatch <$>
+    case patchInputPatch o of
+      "-" -> B.getContents
+      fn -> B.readFile fn
+  hs <-
+    case hs' of
+      Left err -> ioError . userError $ "PATCHFILE parsing failed: " ++ err
+      Right x -> pure x
+  let (toks, rej, warns) =
+        patchToks toksIn hs (patchReverse o) (patchMergeOpts o)
+      sus = not (null warns)
+      rewritingInput = null (patchOutput o)
+      outputStdout = patchOutput o == "-"
+      rejfile
+        | rewritingInput || outputStdout = patchInput o ++ ".rej"
+        | otherwise = patchOutput o ++ ".rej"
+      backupfile
+        | patchBackup o == "-" = ""
+        | patchBackup o == "" && rewritingInput = patchInput o ++ ".orig"
+        | otherwise = patchBackup o
+  traverse_ (hPutStrLn stderr . pprPatchWarn) warns
+  if patchDryRun o
+    then do
+      hPutStrLn stderr $
+        (if sus
+           then "Possibly problematic"
+           else "OK") ++
+        " patch with " ++ show (length rej :: Int) ++ " rejected hunks"
+      BB.hPutBuilder stdout (pprHunks rej)
+    else do
+      unless (null rej) $ BB.writeFile rejfile (pprHunks rej)
+      when (sus && not (null backupfile)) $
+        B.readFile (patchInput o) >>= B.writeFile backupfile
+      ($ mconcat $ map (BB.byteString . snd) $ V.toList toks) $
+        if outputStdout
+          then BB.hPutBuilder stdout
+          else BB.writeFile
+                 (if rewritingInput
+                    then patchInput o
+                    else patchOutput o)
+      when sus $ exitWith (ExitFailure 1)
+
+main' :: IO ()
+main' =
   let opts :: ParserInfo ADiffOptions
       opts =
         info
           (adiffOptions <**> versionOption "adiff" <**> helperOption)
           (fullDesc <>
            progDesc
-             "Compare, patch and merge files on arbitrarily-tokenized sequences." <>
+             "Compare, patch and merge files on arbitrarily tokenized sequences." <>
            header "adiff: arbitrary-token diff utilities")
   in do ADiffOptions {adiffTokOpts = topt, adiffCmdOpts = copt} <-
           customExecParser (prefs $ helpShowGlobals <> subparserInline) opts
-        redfa <- redfaPrepareRules ropt
-        case copt of
-          CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
-            [toks1, toks2] <- traverse (loadToks redfa) [f1, f2]
-            BB.hPutBuilder stdout $
-              pprHunks $ hunks (max 0 ctxt) $ diffToks toks1 toks2
-          CmdPatch {} -> putStrLn "not supported yet"
-          CmdDiff3 ctxt f1 f2 f3 mo -> do
-            [toksMine, toksOld, toksYour] <-
-              traverse (loadToks redfa) [f1, f2, f3]
-            let d3 = diff3Toks mo toksMine toksOld toksYour
-            BB.hPutBuilder stdout $
-              if mergeDoMerge mo
-                then fmtMerged mo d3
-                else pprHunks $ hunks (max 0 ctxt) d3
+        (case copt of
+           CmdDiff {} -> doDiff
+           CmdDiff3 {} -> doDiff3
+           CmdPatch {} -> doPatch)
+          topt
+          copt
+
+main :: IO ()
+main =
+  main' `catch`
+  (\e -> do
+     let err = show (e :: IOException)
+     hPutStrLn stderr err
+     exitWith $ ExitFailure 2)
diff --git a/src/Merge.hs b/src/Merge.hs
index 0cfe750..b56a70d 100644
--- a/src/Merge.hs
+++ b/src/Merge.hs
@@ -33,7 +33,7 @@ mergeOption forPatch =
       switch
         (short 'a' <>
          long "add-linebreak" <>
-         help "Automatically add a line break after conflict markers") <*>
+         help "Automatically add a line break after conflict markers. Useful with the `lines' lexer.") <*>
       mo)
  where
    marker = fromString . replicate 7
@@ -44,7 +44,7 @@ mergeOption forPatch =
        long "merge" <>
        help
          (if forPatch
-            then "Merge using conflict markers instead of printing the rejected hunks"
+            then "Instead of printing the rejected hunks, merge using conflict markers as if INPUT were `MYFILE' and the patch had produced `YOURFILE' from the original."
             else "Output the merged file instead of the patch")) <*>
   switch
     (short 'i' <>
@@ -69,11 +69,11 @@ mergeOption forPatch =
      help
        ("On whitespace mismatch, output the version from " ++
         (if forPatch
-           then "original file"
+           then "the original file"
            else "MYFILE") ++
         " instead of the one from " ++
         (if forPatch
-           then "patch"
+           then "the context in the patch"
            else "YOURFILE"))) <*>
   strOption
     (long "merge-start" <>
diff --git a/src/Patch.hs b/src/Patch.hs
index 08b324f..b1b62e2 100644
--- a/src/Patch.hs
+++ b/src/Patch.hs
@@ -1,4 +1,30 @@
-module Patch where
+module Patch (patchToks, pprPatchWarn) where
 
-patchToks :: a
-patchToks = undefined
+import Format
+import Types
+import Merge
+import qualified Data.ByteString as B
+
+data PatchWarn = HunkMatched Hunk Int | HunkFailed Hunk
+
+pprPatchWarn :: PatchWarn -> String
+pprPatchWarn (HunkMatched ((o, n), _) at) =
+  "hunk (-" ++ show o ++ " +" ++ show n ++ ") succeeded at " ++ show at
+pprPatchWarn (HunkFailed ((o, n), _)) =
+  "hunk (-" ++ show o ++ " +" ++ show n ++ ") FAILED"
+
+patchToks :: TV -> [Hunk] -> Bool -> MergeOpts -> (TV, [Hunk], [PatchWarn])
+patchToks toks hunks rev mopt = undefined
+  --let (tokss, hunk, sus) = patchHunksAt 0 0 0 (if rev then map revHunk hunks else hunks) mopt
+
+patchHunksAt :: TV -> Int -> Int -> Int -> [Hunk] -> MergeOpts -> ([TV], [PatchWarn])
+patchHunksAt toks tvoff origoff newoff hunks mopt = undefined
+
+hunkToMatch :: Diff -> Diff
+hunkToMatch = filter $ (`elem` [Keep, Remove, Original]) . fst
+
+hunkToReplace :: Diff -> Diff
+hunkToReplace = filter $ (`elem` [Keep, Add, MineChanged, YourChanged]) . fst
+
+tokCmp :: Bool -> Tok -> Tok -> Bool
+tokCmp True (False, _) (False, _) = True -- ignore the whitespace change
+tokCmp _ a b = a == b -- otherwise just compare
diff --git a/src/Redfa.hs b/src/Redfa.hs
deleted file mode 100644
index 9967d59..0000000
--- a/src/Redfa.hs
+++ /dev/null
@@ -1,177 +0,0 @@
-{-# LANGUAGE FlexibleContexts #-}
-
-module Redfa where
-
-import Control.Monad
-import qualified Data.ByteString as B
-import qualified Data.ByteString.Builder as BB
-import qualified Data.ByteString.Char8 as B8
-import qualified Data.ByteString.Internal as BI
-import qualified Data.ByteString.Lazy as BL
-import Data.ByteString.UTF8 (fromString, toString)
-import Data.List
-import Data.Maybe
-import qualified Data.Vector as V
-import Options.Applicative
-import Substr
-import Text.Regex.TDFA
-import Types
-
-data RedfaOption
-  = RedfaOptionRules [BS]
-  | RedfaOptionFile String
-  deriving (Show)
-
-data RedfaSpec =
-  RedfaSpec
-    { redfaStart :: Int
-    , redfaNames :: V.Vector BS
-    , redfaRules :: V.Vector [RedfaRule] --start, mid, jump, token
-    }
-
-data RedfaRule =
-  RedfaRule
-    { rrRegexStart :: Regex
-    , rrRegexMid :: Regex
-    , rrJump :: Int
-    , rrIsToken :: Bool
-    }
-
-redfaOption :: Parser RedfaOption
-redfaOption =
-  RedfaOptionRules <$>
-  some
-    (strOption $
-     metavar "RULE" <>
-     short 'L' <>
-     long "lex" <> help "Lexing rule (specify repeatedly for more rules)") <|>
-  RedfaOptionFile <$>
-  strOption
-    (metavar "FILE" <>
-     short 'l' <>
-     long "lexer" <> help "File from where to load the lexing rules")
-
-redfaOptionToRuleStrings :: RedfaOption -> IO [BS]
-redfaOptionToRuleStrings (RedfaOptionRules x) = return x
-redfaOptionToRuleStrings (RedfaOptionFile fn) =
-  B8.lines <$> B.readFile fn -- TODO improve
-
-splitFirst :: Char -> BS -> (BS, BS)
-splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
-
-redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
-redfaRuleStringToRuleStr s =
-  let (spec, regex) = 
splitFirst ':' s - (from, to) = splitFirst '>' spec - sf = B8.strip from - (cleanFrom, isToken) - | B.null sf = (sf, True) - | B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False) - | otherwise = (sf, True) - go - | B.null s = Nothing - | B.head s == 35 = Nothing -- # comment - | B.null regex = Just (B.empty, B.empty, spec, isToken) - | B.null to = Just (cleanFrom, cleanFrom, B.tail regex, isToken) - | otherwise = - Just (cleanFrom, B8.strip $ B.tail to, B.tail regex, isToken) - in go - -unescapeRegex :: MonadFail m => BS -> m BS -unescapeRegex s' = BL.toStrict . BB.toLazyByteString <$> unescape' s' - where - unescape' :: MonadFail m => BS -> m BB.Builder - unescape' s - | B.null s = return mempty - | B.head s == BI.c2w '\\' && B.null (B.tail s) = - fail "incomplete escape sequence" - | B.head s == BI.c2w '\\' = - let rest = B.tail s - cc = B.head rest - thechar = - BB.stringUtf8 $ - case BI.w2c cc of - '`' -> "\\`" - '\'' -> "\\'" - 'b' -> "\\b" - 'B' -> "\\B" - '<' -> "\\<" - '>' -> "\\>" - 'a' -> "\a" - 'e' -> "\x1b" - 'f' -> "\f" - 'n' -> "\n" - 'r' -> "\r" - 't' -> "\t" - '\\' -> "\\\\" - a -> [a] --TODO add support for \x and \u - in (thechar <>) <$> unescape' (B.tail rest) - | otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s) - -redfaPrepareRules :: RedfaOption -> IO RedfaSpec -redfaPrepareRules opt = do - (states, jumps, regexes, isToken) <- - unzip4 . mapMaybe redfaRuleStringToRuleStr <$> redfaOptionToRuleStrings opt - uRegexes <- traverse unescapeRegex regexes - startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes - midREs <- traverse (makeRegexM . (fromString "\\`(.|\n)" <>)) uRegexes - let ids = nub . sort $ states ++ jumps - index' = fromJust . flip elemIndex ids - statesIds = map index' states - jumpsIds = map index' jumps - start = index' $ head states - rules :: [[RedfaRule]] - rules = do - stateId <- [0 .. length ids - 1] - return $ do - (a, b, (rs, rm), t) <- - zip4 statesIds jumpsIds (zip startREs midREs) isToken - guard $ a == stateId - return $ RedfaRule rs rm b t - return $ RedfaSpec start (V.fromList ids) (V.fromList rules) - -redfaTokenize :: MonadFail m => RedfaSpec -> BS -> m [Tok] -redfaTokenize spec s = redfaTokenize' spec s (redfaStart spec) 0 [] - -redfaTokenize' :: - MonadFail m => RedfaSpec -> BS -> Int -> Int -> [Int] -> m [Tok] -redfaTokenize' spec s state off visited - | off >= B.length s = pure [] - | otherwise = - let (ooff, reg) = - if off == 0 - then (0, rrRegexStart) - else (1, rrRegexMid) - matchString = B.drop (off - ooff) s - matches :: [(RedfaRule, (MatchOffset, MatchLength))] - matches = - filter contOK $ - zip rules $ map (\x -> match (reg x) matchString) rules - contOK (RedfaRule {rrJump = j}, (off', len)) - | off' /= 0 = False - | len > ooff = True - | otherwise = j `notElem` visited - in case matches of - [] -> - fail $ - "Tokenization could not continue from " <> - stateStrPretty <> " at offset " <> show (off + ooff) - ((rule, (_, len)):_) -> - let matchLen = len - ooff - in (if matchLen > 0 - then (:) (rrIsToken rule, substr off matchLen s) - else id) <$> - redfaTokenize' - spec - s - (rrJump rule) - (off + matchLen) - (if matchLen > 0 - then [] - else state : visited) - where - rules = redfaRules spec V.! state - stateStr = redfaNames spec V.! 
state
-    stateStrPretty
-      | B.null stateStr = "anonymous state"
-      | otherwise = "state `" <> toString stateStr <> "'"
diff --git a/src/Substr.hs b/src/Substr.hs
deleted file mode 100644
index e57210c..0000000
--- a/src/Substr.hs
+++ /dev/null
@@ -1,7 +0,0 @@
-module Substr where
-
-import qualified Data.ByteString as B
-import Types
-
-substr :: Int -> Int -> BS -> BS
-substr b l = B.take l . B.drop b
diff --git a/src/Tokenizers.hs b/src/Tokenizers.hs
new file mode 100644
index 0000000..a678090
--- /dev/null
+++ b/src/Tokenizers.hs
@@ -0,0 +1,114 @@
+{-# LANGUAGE Rank2Types #-}
+{-# LANGUAGE TupleSections #-}
+
+module Tokenizers (TokOpts, tokOptions, tokenize) where
+
+import qualified Data.ByteString as B
+import qualified Data.ByteString.UTF8 as BU8
+import qualified Data.Word8 as W8
+import Options.Applicative
+import Types
+import qualified Unicode.Char.General as UC
+
+data TokOpts =
+  TokOpts
+    { optTokenizerName :: String
+    }
+  deriving (Show)
+
+tokOptions :: Parser TokOpts
+tokOptions =
+  TokOpts <$>
+  strOption
+    (metavar "LEXER" <>
+     short 'l' <>
+     long "lexer" <>
+     value "text" <>
+     help "Lexer name (use --help-lexers to list available ones)")
+
+--TODO this should later allow choosing the lexer by suffix
+tokenize :: MonadFail m => TokOpts -> FilePath -> BS -> m [Tok]
+tokenize topt _ =
+  case filter ((== optTokenizerName topt) . tkName) tokenizers of
+    [t] -> runTokenizer t
+    _ -> const . fail $ "Lexer not found: " ++ optTokenizerName topt
+
+data Tokenizer =
+  Tokenizer
+    { tkName :: String
+    , tkDescription :: String
+    , runTokenizer :: forall m. MonadFail m =>
+                                  BS -> m [Tok]
+    }
+
+tokenizers :: [Tokenizer]
+tokenizers =
+  [ Tokenizer "lines" "works like the traditional diff" tokenizeLines
+  , Tokenizer "asciiwords" "separate by ASCII whitespace" tokenizeWords8
+  , Tokenizer "words" "separate on any UTF-8 whitespace" tokenizeWords
+  , Tokenizer
+      "text"
+      "separate groups of similar-class UTF-8 characters"
+      tokenizeUnicode
+  ]
+
+tokenizeLines :: MonadFail m => BS -> m [Tok]
+tokenizeLines = pure . map (True, ) . BU8.lines'
+
+tokenizeWords8 :: MonadFail m => BS -> m [Tok]
+tokenizeWords8 =
+  pure . map (\x -> (not $ W8.isSpace $ B.head x, x)) . bsGroupOn W8.isSpace
+  where
+    bsGroupOn f = B.groupBy (\l r -> f l == f r)
+
+tokenizeWords :: MonadFail m => BS -> m [Tok]
+tokenizeWords = pure . map makeTokenBU8 . groupOnBU8 UC.isWhiteSpace
+
+makeTokenBU8 :: BS -> Tok
+makeTokenBU8 s =
+  if maybe False (not . UC.isWhiteSpace . fst) (BU8.decode s)
+    then (True, s)
+    else (False, s)
+
+{- needed?
+foldri :: (Char -> Int -> a -> a) -> a -> BS -> a
+foldri cons nil cs =
+  case decode cs of
+    Just (a, l) -> cons a l (foldrS cons nil $ drop l cs)
+    Nothing -> nil
+-}
+
+spanBU8 :: (Char -> Bool) -> BS -> (BS, BS)
+spanBU8 f orig = B.splitAt (go 0 orig) orig
+  where
+    go :: Int -> BS -> Int
+    go n bs =
+      case BU8.decode bs of
+        Just (a, l) ->
+          if f a
+            then go (n + l) (B.drop l bs)
+            else n
+        Nothing -> n
+
+groupByBU8 :: (Char -> Char -> Bool) -> BS -> [BS]
+groupByBU8 f s =
+  case BU8.decode s of
+    Just (a, _) ->
+      let (g, s') = spanBU8 (f a) s
+       in g : groupByBU8 f s'
+    Nothing -> []
+
+groupOnBU8 :: Eq a => (Char -> a) -> BS -> [BS]
+groupOnBU8 f = groupByBU8 (\l r -> f l == f r)
+
+tokenizeUnicode :: MonadFail m => BS -> m [Tok]
+tokenizeUnicode = pure . map makeTokenBU8 . 
groupOnBU8 simpleUnicodeCategory + +simpleUnicodeCategory :: Char -> Int +simpleUnicodeCategory c + | UC.isWhiteSpace c = 1 + | UC.isAlphaNum c = 2 + | UC.isSymbol c = 3 + | UC.isPunctuation c = 4 + | UC.isSeparator c = 5 + | otherwise = 0 diff --git a/src/Types.hs b/src/Types.hs index c2db41d..3b98b85 100644 --- a/src/Types.hs +++ b/src/Types.hs @@ -1,3 +1,5 @@ +--{-# LANGUAGE BangPatterns #-} + module Types where import Data.ByteString
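
A quick sketch of what the new default `text` lexer should produce -- not part of the patch itself, and `tokenizeUnicode` is internal to `src/Tokenizers.hs` as of this revision, so running this standalone assumes a hypothetical export:

```
-- Sketch: behaviour of the "text" lexer. Groups of characters sharing a
-- simpleUnicodeCategory become one token each; the Bool in Tok marks
-- content (True) vs whitespace (False).
import qualified Data.ByteString.UTF8 as BU8
import Tokenizers (tokenizeUnicode) -- hypothetical export, internal for now

main :: IO ()
main = tokenizeUnicode (BU8.fromString "adiff 0.2!") >>= print
-- expected:
-- [(True,"adiff"),(False," "),(True,"0"),(True,"."),(True,"2"),(True,"!")]
```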
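The new attoparsec parser in `src/Format.hs` is meant to be the inverse of the pretty-printer, which is what `adiff patch` relies on when it reads hunks back in. A property sketch, assuming `Op` and the `Hunk` synonym from `src/Types.hs` have `Eq` instances (those definitions aren't shown in this patch):

```
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Lazy as BL
import Format (parsePatch, pprHunks)
import Types

-- Rendering hunks and parsing them back should be the identity;
-- escNewlines and parseTokChar keep the token bytes intact either way.
roundTrips :: [Hunk] -> Bool
roundTrips hs =
  parsePatch (BL.toStrict (BB.toLazyByteString (pprHunks hs))) == Right hs
```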
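On the `src/Diff.hs` side, the new `deTokPrio` field just pre-packages the scoring that `extraScore` used to compute inline: a content token weighs its byte length, whitespace weighs nothing, so the alignment still prefers matching long tokens. A sketch of what the current `makePrios` hands out (it ignores its two token-vector arguments for now):

```
import qualified Data.ByteString as B
import Types

-- Equivalent of the function makePrios returns for any input vectors:
tokPrio :: Tok -> Int
tokPrio (isToken, str)
  | isToken = B.length str -- content tokens are worth their length
  | otherwise = 0          -- whitespace earns no matching bonus
-- e.g. tokPrio (True, "needle") == 6, tokPrio (False, " ") == 0
-- (the literals assume OverloadedStrings for the BS token text)
```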