a bit of a revamp
This commit is contained in:
parent
441e2b4351
commit
6e2ab88961
26
README.md
26
README.md
|
@ -12,10 +12,14 @@ standard unix `diff` and related tools.
|
|||
The problem was previously tackled by Arek Antoniewicz on MFF CUNI, who
|
||||
produced a working software package in C++, and designed the Regex-edged DFAs
|
||||
(REDFAs) that were used for user-specifiable tokenization of the input. The
|
||||
work on the corresponding thesis is currently ongoing.
|
||||
work on the corresponding thesis is finished.
|
||||
|
||||
This started as a simple Haskell port of that work, and packed some relatively
|
||||
orthogonal improvements (mainly the histogram-style diffing).
|
||||
orthogonal improvements (mainly the histogram-style diffing). I later got rid
|
||||
of the REDFA concept -- while super-interesting and useful in theory, I didn't
|
||||
find a sufficiently universal way to build good lexers from user-specified
|
||||
strings. Having a proper Regex representation library (so that e.g.
|
||||
reconstructing Flex is easy) would help a lot.
|
||||
|
||||
### TODO list
|
||||
|
||||
|
@ -23,16 +27,12 @@ orthogonal improvements (mainly the histogram-style diffing).
|
|||
of hunk context. `diff` and `diff3` works.
|
||||
- Implement a splitting heuristic for diffs, so that diffing of large files
|
||||
doesn't take aeons
|
||||
- Check whether REDFA can even be implemented correctly with current Haskell
|
||||
libraries (most regex libraries target a completely different). Taking the
|
||||
lexer specification format from `alex` currently seems like a much better
|
||||
option. Deferring the task unix-ishly to another program could work too.
|
||||
- check if we can have external lexers, unix-style
|
||||
|
||||
# How-To
|
||||
|
||||
Install using `cabal`. The `adiff` program has 3 sub-commands that work like
|
||||
`diff`, `patch` and `diff3`. It expects a lexing specification on the input;
|
||||
there are several very simple example lexers in `lexers/`.
|
||||
`diff`, `patch` and `diff3`.
|
||||
|
||||
## Example
|
||||
|
||||
|
@ -49,15 +49,11 @@ Patching is hard. I still cannot rhyme.
|
|||
```
|
||||
|
||||
Let's use the `words` lexer, which marks everything whitespace-ish as
|
||||
whitespace, and picks up groups of non-whitespace "content" characters:
|
||||
```
|
||||
:[^ \t\n]*
|
||||
_:[ \t\n]*
|
||||
```
|
||||
whitespace, and picks up groups of non-whitespace "content" characters.
|
||||
|
||||
Diffing the 2 files gets done as such:
|
||||
```
|
||||
$ cabal run adiff -- -l lexers/words diff orig mine
|
||||
$ cabal run adiff -- -l words diff orig mine
|
||||
```
|
||||
|
||||
You should get something like this:
|
||||
|
@ -99,7 +95,7 @@ I cannot do verses.
|
|||
We can run `diff3` to get a patch with both changes, optionally with reduced
|
||||
context:
|
||||
```
|
||||
$ cabal run adiff -- -l lexers/words diff3 mine orig yours -C1
|
||||
$ cabal run adiff -- -l words diff3 mine orig yours -C1
|
||||
```
|
||||
...which outputs:
|
||||
```
|
||||
|
|
11
adiff.cabal
11
adiff.cabal
|
@ -51,8 +51,7 @@ executable adiff
|
|||
Hunks,
|
||||
Merge,
|
||||
Patch,
|
||||
Redfa,
|
||||
Substr,
|
||||
Tokenizers,
|
||||
Types,
|
||||
Version
|
||||
|
||||
|
@ -60,13 +59,15 @@ executable adiff
|
|||
other-extensions: CPP
|
||||
|
||||
-- Other library packages from which modules are imported.
|
||||
build-depends: base ^>=4.13.0.0,
|
||||
build-depends: base ^>=4.15.0.0,
|
||||
attoparsec ^>=0.14,
|
||||
extra ^>= 1.7,
|
||||
mmap ^>=0.5,
|
||||
regex-tdfa ^>= 1.3,
|
||||
optparse-applicative ^>=0.16,
|
||||
bytestring ^>= 0.10.12,
|
||||
bytestring ^>= 0.11.2,
|
||||
vector ^>=0.12,
|
||||
word8 ^>=0.1,
|
||||
unicode-data ^>=0.3,
|
||||
utf8-string ^>=1.0
|
||||
|
||||
-- Directories containing source files.
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
[a-z]
|
||||
_:[ \n]
|
|
@ -1,2 +0,0 @@
|
|||
[^\n]*\n
|
||||
[^\n]*
|
|
@ -1,2 +0,0 @@
|
|||
[0-9]
|
||||
_:\n
|
|
@ -1,2 +0,0 @@
|
|||
:[^ \t\n]*
|
||||
_:[ \t\n]*
|
28
src/Diff.hs
28
src/Diff.hs
|
@ -20,9 +20,9 @@ data DiffEnv =
|
|||
, deB :: Int
|
||||
, deVS :: V.Vector (Int, Int)
|
||||
, deVE :: V.Vector (Int, Int)
|
||||
, deTokPrio :: Tok -> Int
|
||||
, deTrans :: Bool
|
||||
}
|
||||
deriving (Show)
|
||||
|
||||
toksMatch :: Int -> Int -> DiffEnv -> Bool
|
||||
toksMatch x y DiffEnv {deT1 = t1, deT2 = t2} = t1 V.! x == t2 V.! y
|
||||
|
@ -47,10 +47,19 @@ stripEqToks t1 t2 = (pre, post, t1', t2')
|
|||
t1' = V.slice b (l1 - e - b) t1
|
||||
t2' = V.slice b (l2 - e - b) t2
|
||||
|
||||
makePrios :: TV -> TV -> (Bool, BS) -> Int
|
||||
makePrios _ _ = get
|
||||
where
|
||||
get (isToken, str) =
|
||||
if isToken
|
||||
then B.length str
|
||||
else 0
|
||||
|
||||
diffToks :: TV -> TV -> Diff
|
||||
diffToks t1' t2' = pre ++ res ++ post
|
||||
where
|
||||
(pre, post, t1, t2) = stripEqToks t1' t2'
|
||||
stats = makePrios t1' t2'
|
||||
res
|
||||
| V.null t1 = map (Add, ) (V.toList t2)
|
||||
| V.null t2 = map (Remove, ) (V.toList t1)
|
||||
|
@ -67,6 +76,7 @@ diffToks t1' t2' = pre ++ res ++ post
|
|||
, deB = V.length t2
|
||||
, deVS = V.fromList $ zip [0 .. V.length t2] (repeat 0)
|
||||
, deVE = V.fromList $ reverse $ zip [0 .. V.length t2] (repeat 0)
|
||||
, deTokPrio = stats
|
||||
, deTrans = False
|
||||
}
|
||||
| otherwise =
|
||||
|
@ -82,6 +92,7 @@ diffToks t1' t2' = pre ++ res ++ post
|
|||
, deB = V.length t1
|
||||
, deVS = V.fromList $ zip [0 .. V.length t1] (repeat 0)
|
||||
, deVE = V.fromList $ reverse $ zip [0 .. V.length t1] (repeat 0)
|
||||
, deTokPrio = stats
|
||||
, deTrans = True
|
||||
}
|
||||
|
||||
|
@ -110,12 +121,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
|||
mid = quot (s + e) 2
|
||||
vecSmid = vecS mid
|
||||
vecEmid = vecE mid
|
||||
extraScore i =
|
||||
if isToken
|
||||
then -(B.length str)
|
||||
else 0
|
||||
where
|
||||
(isToken, str) = deT1 de V.! i
|
||||
prio i = negate . deTokPrio de $ deT1 de V.! i
|
||||
vecS = vec -- "forward" operation
|
||||
where
|
||||
vec i
|
||||
|
@ -131,7 +137,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
|||
(iupleft, supleft) = v V.! pred j
|
||||
keep
|
||||
| toksMatch (pred i) (pred j) de =
|
||||
min (iupleft, supleft + extraScore (pred i))
|
||||
min (iupleft, supleft + prio (pred i))
|
||||
| otherwise = id
|
||||
res = keep $ min (succ iup, sup) (succ ileft, sleft)
|
||||
in res : go (succ j) res
|
||||
|
@ -150,7 +156,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
|||
(idownright, sdownright) = v V.! succ j
|
||||
keep
|
||||
| toksMatch i j de =
|
||||
min (idownright, sdownright + extraScore i)
|
||||
min (idownright, sdownright + prio i)
|
||||
| otherwise = id
|
||||
res = keep $ min (succ idown, sdown) (succ iright, sright)
|
||||
in res : go (pred j) res
|
||||
|
@ -213,7 +219,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
|||
, fst (vecLS V.! i) == fst sCost - a + i
|
||||
, sumL V.! i == totalCost
|
||||
, if doKeep
|
||||
then scoreAdd (vecLS V.! i) (0, extraScore s) ==
|
||||
then scoreAdd (vecLS V.! i) (0, prio s) ==
|
||||
vecRS V.! succ i
|
||||
else scoreAdd (vecLS V.! i) (1, 0) == vecRS V.! i
|
||||
, if doKeep
|
||||
|
@ -223,7 +229,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
|||
]
|
||||
jumpEnd =
|
||||
if doKeep
|
||||
then jumpPos + 1
|
||||
then succ jumpPos
|
||||
else jumpPos
|
||||
in map
|
||||
(\i ->
|
||||
|
|
|
@ -1,30 +1,39 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Format
|
||||
( pprHunks
|
||||
, pprHunk
|
||||
, pprDiff1
|
||||
, parsePatch
|
||||
) where
|
||||
|
||||
import Types
|
||||
|
||||
import Control.Applicative
|
||||
import Control.Monad
|
||||
import Data.Attoparsec.ByteString as A
|
||||
import qualified Data.ByteString as B
|
||||
import qualified Data.ByteString.Builder as BB
|
||||
import qualified Data.ByteString.Internal as BI
|
||||
import qualified Data.ByteString.Lazy as BL
|
||||
import Data.String
|
||||
import Data.Word8 as W8
|
||||
|
||||
pprHunkHdr :: Int -> Int -> BB.Builder
|
||||
pprHunkHdr i j = fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@"
|
||||
backslash :: Word8
|
||||
backslash = BI.c2w '\\'
|
||||
|
||||
lineSep :: BB.Builder
|
||||
lineSep = fromString "\n"
|
||||
newline :: Word8
|
||||
newline = BI.c2w '\n'
|
||||
|
||||
pprHunks :: [Hunk] -> BB.Builder
|
||||
pprHunks = mconcat . map pprHunk
|
||||
|
||||
lineSep :: BB.Builder
|
||||
lineSep = BB.word8 newline
|
||||
|
||||
pprHunkHdr :: Int -> Int -> BB.Builder
|
||||
pprHunkHdr i j =
|
||||
(fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@") <> lineSep
|
||||
|
||||
pprHunk :: Hunk -> BB.Builder
|
||||
pprHunk ((i, j), toks) =
|
||||
mconcat ((pprHunkHdr i j <> lineSep) : map pprDiff1 toks)
|
||||
pprHunk ((i, j), toks) = mconcat (pprHunkHdr i j : map pprDiff1 toks)
|
||||
|
||||
pprDiff1 :: (Op, Tok) -> BB.Builder
|
||||
pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
|
||||
|
@ -46,6 +55,74 @@ pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
|
|||
escNewlines :: BS -> BB.Builder
|
||||
escNewlines s
|
||||
| B.null s = mempty
|
||||
| B.head s == BI.c2w '\n' = BB.stringUtf8 "\\n" <> escNewlines (B.tail s)
|
||||
| B.head s == BI.c2w '\\' = BB.stringUtf8 "\\\\" <> escNewlines (B.tail s)
|
||||
| B.head s == newline =
|
||||
BB.word8 backslash <> BB.word8 (BI.c2w 'n') <> escNewlines (B.tail s)
|
||||
| B.head s == backslash =
|
||||
BB.word8 backslash <> BB.word8 backslash <> escNewlines (B.tail s)
|
||||
| otherwise = BB.word8 (B.head s) <> escNewlines (B.tail s)
|
||||
|
||||
--parsePatch :: BS -> Either String [Hunk]
|
||||
parsePatch = parseOnly parseHunks
|
||||
|
||||
parseHunks :: Parser [Hunk]
|
||||
parseHunks = many parseHunk <* endOfInput
|
||||
|
||||
parseHunk :: Parser Hunk
|
||||
parseHunk = liftA2 (,) parseHunkHdr (many parseDiff1)
|
||||
|
||||
parseInt :: Parser Int
|
||||
parseInt = read . map BI.w2c <$> some (satisfy W8.isDigit)
|
||||
|
||||
eol :: Parser ()
|
||||
eol = void $ word8 newline
|
||||
|
||||
parseHunkHdr :: Parser (Int, Int)
|
||||
parseHunkHdr = do
|
||||
void . string $ fromString "@@ -"
|
||||
i <- parseInt
|
||||
void . string $ fromString " +"
|
||||
j <- parseInt
|
||||
void . string $ fromString " @@"
|
||||
eol
|
||||
return (i, j)
|
||||
|
||||
pairs2parsers :: [(a, Char)] -> [Parser a]
|
||||
pairs2parsers = map (\(x, ch) -> x <$ word8 (BI.c2w ch))
|
||||
|
||||
parseOpList :: [Parser Op]
|
||||
parseOpList =
|
||||
pairs2parsers
|
||||
[ (Add, '+')
|
||||
, (Keep, ' ')
|
||||
, (Remove, '-')
|
||||
, (MineChanged, '<')
|
||||
, (Original, '=')
|
||||
, (YourChanged, '>')
|
||||
]
|
||||
|
||||
parseOp :: Parser Op
|
||||
parseOp = choice parseOpList
|
||||
|
||||
parseTokMarkList :: [Parser Bool]
|
||||
parseTokMarkList = pairs2parsers [(True, '|'), (False, '.')]
|
||||
|
||||
parseTokMark :: Parser Bool
|
||||
parseTokMark = choice parseTokMarkList
|
||||
|
||||
parseTokBS :: Parser BS
|
||||
parseTokBS =
|
||||
(BL.toStrict . BB.toLazyByteString . mconcat <$> many parseTokChar) <* eol
|
||||
|
||||
parseTokChar :: Parser BB.Builder
|
||||
parseTokChar =
|
||||
choice
|
||||
[ BB.word8 newline <$ string (fromString "\\n")
|
||||
, BB.word8 backslash <$ string (fromString "\\\\")
|
||||
, BB.word8 <$> satisfy (\w -> w /= backslash && w /= newline)
|
||||
]
|
||||
|
||||
parseTok :: Parser Tok
|
||||
parseTok = liftA2 (,) parseTokMark parseTokBS
|
||||
|
||||
parseDiff1 :: Parser (Op, Tok)
|
||||
parseDiff1 = liftA2 (,) parseOp parseTok
|
||||
|
|
200
src/Main.hs
200
src/Main.hs
|
@ -1,6 +1,10 @@
|
|||
module Main where
|
||||
|
||||
import Control.Exception
|
||||
import Control.Monad
|
||||
import qualified Data.ByteString as B
|
||||
import qualified Data.ByteString.Builder as BB
|
||||
import Data.Foldable (traverse_)
|
||||
import qualified Data.Vector as V
|
||||
import Diff
|
||||
import Diff3
|
||||
|
@ -8,15 +12,17 @@ import Format
|
|||
import Hunks
|
||||
import Merge
|
||||
import Options.Applicative
|
||||
import Redfa
|
||||
import System.IO (stdout)
|
||||
import Patch
|
||||
import System.Exit
|
||||
import System.IO (hPutStrLn, stderr, stdout)
|
||||
import System.IO.MMap
|
||||
import Tokenizers
|
||||
import Types
|
||||
import Version
|
||||
|
||||
data ADiffOptions =
|
||||
ADiffOptions
|
||||
{ adiffRedfaOpt :: RedfaOption
|
||||
{ adiffTokOpts :: TokOpts
|
||||
, adiffCmdOpts :: ADiffCommandOpts
|
||||
}
|
||||
deriving (Show)
|
||||
|
@ -29,11 +35,16 @@ data ADiffCommandOpts
|
|||
}
|
||||
| CmdPatch
|
||||
{ patchDryRun :: Bool
|
||||
, patchInDir :: Maybe String
|
||||
, patchInput :: String
|
||||
--, patchInDir :: Maybe String
|
||||
--, patchPathStrip :: Int
|
||||
, patchInputPatch :: String
|
||||
, patchOutput :: String
|
||||
, patchReject :: String --todo convert to Maybes with optional
|
||||
, patchBackup :: String
|
||||
, patchReverse :: Bool
|
||||
, patchPathStrip :: Int
|
||||
, context :: Int
|
||||
, patchMergeOpts :: MergeOpts
|
||||
, patchInput :: String
|
||||
}
|
||||
| CmdDiff3
|
||||
{ context :: Int
|
||||
|
@ -44,15 +55,22 @@ data ADiffCommandOpts
|
|||
}
|
||||
deriving (Show)
|
||||
|
||||
contextOpt :: Parser Int
|
||||
contextOpt =
|
||||
contextOpt :: Bool -> Parser Int
|
||||
contextOpt forPatch =
|
||||
check <$>
|
||||
option
|
||||
auto
|
||||
(metavar "CONTEXT" <>
|
||||
short 'C' <>
|
||||
long "context" <>
|
||||
value 5 <> help "How many tokens around the change to include in the patch")
|
||||
value
|
||||
(if forPatch
|
||||
then 4
|
||||
else 8) <>
|
||||
help
|
||||
(if forPatch
|
||||
then "Minimum amount of context tokens that must match so that the hunk is applied"
|
||||
else "How many tokens around the change to include in the patch"))
|
||||
where
|
||||
check c
|
||||
| c < 0 = error "Negative context"
|
||||
|
@ -60,7 +78,7 @@ contextOpt =
|
|||
|
||||
diffCmdOptions :: Parser ADiffCommandOpts
|
||||
diffCmdOptions =
|
||||
CmdDiff <$> contextOpt <*> strArgument (metavar "FROMFILE") <*>
|
||||
CmdDiff <$> contextOpt False <*> strArgument (metavar "FROMFILE") <*>
|
||||
strArgument (metavar "TOFILE")
|
||||
|
||||
patchCmdOptions :: Parser ADiffCommandOpts
|
||||
|
@ -70,33 +88,49 @@ patchCmdOptions =
|
|||
(short 'n' <>
|
||||
long "dry-run" <>
|
||||
help "Do not patch anything, just print what would happen") <*>
|
||||
optional
|
||||
(strOption $
|
||||
short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
|
||||
-- optional (strOption $ short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
|
||||
--option auto (short 'p' <> long "strip" <> metavar "NUM" <> help "Strip NUM leading components from the paths" <> value 0) <*>
|
||||
strOption
|
||||
(short 'i' <>
|
||||
long "input" <>
|
||||
metavar "INPUT" <>
|
||||
help "Read the patchfile from INPUT, defaults to `-' for STDIN" <>
|
||||
metavar "PATCHFILE" <>
|
||||
help "Read the patchfile from PATCHFILE, defaults to `-' for STDIN" <>
|
||||
value "-") <*>
|
||||
strOption
|
||||
(short 'o' <>
|
||||
long "output" <>
|
||||
metavar "OUTPUT" <>
|
||||
help
|
||||
"Write the patched file to OUTPUT, use `-' for STDOUT. By default, INPUT is rewritten." <>
|
||||
value "") <*>
|
||||
strOption
|
||||
(short 'r' <>
|
||||
long "reject" <>
|
||||
metavar "REJECTS" <>
|
||||
help
|
||||
"Write the rejected hunks file to file REJECTS, instead of default `OUTPUT.rej'. Use `-' to discard rejects." <>
|
||||
value "") <*>
|
||||
strOption
|
||||
(short 'b' <>
|
||||
long "backup" <>
|
||||
metavar "BACKUP" <>
|
||||
help
|
||||
"When rewriting INPUT after a partially applied or otherwise suspicious patch, back up the original file in BACKUP instead of default `INPUT.orig'. Use `-' to discard backups." <>
|
||||
value "") <*>
|
||||
switch (short 'R' <> long "reverse" <> help "Unapply applied patches") <*>
|
||||
option
|
||||
auto
|
||||
(short 'p' <>
|
||||
long "strip" <>
|
||||
metavar "NUM" <>
|
||||
help "Strip NUM leading components from the paths" <> value 0) <*>
|
||||
mergeOption True
|
||||
contextOpt True <*>
|
||||
mergeOption True <*>
|
||||
strArgument (metavar "INPUT")
|
||||
|
||||
diff3CmdOptions :: Parser ADiffCommandOpts
|
||||
diff3CmdOptions =
|
||||
CmdDiff3 <$> contextOpt <*> strArgument (metavar "MYFILE") <*>
|
||||
CmdDiff3 <$> contextOpt False <*> strArgument (metavar "MYFILE") <*>
|
||||
strArgument (metavar "OLDFILE") <*>
|
||||
strArgument (metavar "YOURFILE") <*>
|
||||
mergeOption False
|
||||
|
||||
actionOption :: Parser ADiffCommandOpts
|
||||
actionOption =
|
||||
actionOptions :: Parser ADiffCommandOpts
|
||||
actionOptions =
|
||||
hsubparser $
|
||||
mconcat
|
||||
[ command "diff" $ info diffCmdOptions $ progDesc "Compare two files"
|
||||
|
@ -106,36 +140,106 @@ actionOption =
|
|||
]
|
||||
|
||||
adiffOptions :: Parser ADiffOptions
|
||||
adiffOptions = ADiffOptions <$> redfaOption <*> actionOption
|
||||
adiffOptions = ADiffOptions <$> tokOptions <*> actionOptions
|
||||
|
||||
loadToks :: RedfaSpec -> FilePath -> IO TV
|
||||
loadToks redfa f =
|
||||
V.fromList <$> (mmapFileByteString f Nothing >>= redfaTokenize redfa)
|
||||
-- TODO: load in case it's not a regular file
|
||||
loadToks :: TokOpts -> FilePath -> IO TV
|
||||
loadToks topt f =
|
||||
V.fromList <$> (mmapFileByteString f Nothing >>= tokenize topt f)
|
||||
|
||||
main :: IO ()
|
||||
main =
|
||||
doDiff :: TokOpts -> ADiffCommandOpts -> IO ()
|
||||
doDiff topt (CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt}) = do
|
||||
[toks1, toks2] <- traverse (loadToks topt) [f1, f2]
|
||||
let output = hunks (max 0 ctxt) $ diffToks toks1 toks2
|
||||
BB.hPutBuilder stdout $ pprHunks output
|
||||
unless (null output) $ exitWith (ExitFailure 1)
|
||||
doDiff _ _ = error "dispatch failure"
|
||||
|
||||
doDiff3 :: TokOpts -> ADiffCommandOpts -> IO ()
|
||||
doDiff3 topt (CmdDiff3 ctxt f1 f2 f3 mo) = do
|
||||
[toksMine, toksOld, toksYour] <- traverse (loadToks topt) [f1, f2, f3]
|
||||
let d3 = diff3Toks mo toksMine toksOld toksYour
|
||||
isConflict (MineChanged, _) = True
|
||||
isConflict (YourChanged, _) = True
|
||||
isConflict _ = False
|
||||
hasConflict = any isConflict d3
|
||||
BB.hPutBuilder stdout $
|
||||
if mergeDoMerge mo
|
||||
then do
|
||||
fmtMerged mo d3
|
||||
else pprHunks $ hunks (max 0 ctxt) d3
|
||||
when hasConflict $ exitWith (ExitFailure 1)
|
||||
doDiff3 _ _ = error "dispatch failure"
|
||||
|
||||
doPatch :: TokOpts -> ADiffCommandOpts -> IO ()
|
||||
doPatch topt o = do
|
||||
toksIn <- loadToks topt (patchInput o)
|
||||
hs' <-
|
||||
parsePatch <$>
|
||||
case (patchInputPatch o) of
|
||||
"-" -> B.getContents
|
||||
fn -> B.readFile fn
|
||||
hs <-
|
||||
case hs' of
|
||||
Left _ -> ioError $ userError "PATCHFILE parsing failed"
|
||||
Right x -> pure x
|
||||
let (toks, rej, warns) =
|
||||
patchToks toksIn hs (patchReverse o) (patchMergeOpts o)
|
||||
sus = not (null warns)
|
||||
rewritingInput = null (patchOutput o)
|
||||
outputStdout = patchOutput o == "-"
|
||||
rejfile
|
||||
| rewritingInput || outputStdout = patchInput o ++ ".rej"
|
||||
| otherwise = patchOutput o ++ ".rej"
|
||||
backupfile
|
||||
| patchBackup o == "-" = ""
|
||||
| patchBackup o == "" && rewritingInput = patchInput o ++ ".orig"
|
||||
| otherwise = patchBackup o
|
||||
traverse_ (hPutStrLn stderr . pprPatchWarn) warns
|
||||
if patchDryRun o
|
||||
then do
|
||||
hPutStrLn stderr $
|
||||
(if sus
|
||||
then "OK"
|
||||
else "Possibly problematic") ++
|
||||
" patch with " ++ show (length rej :: Int) ++ " rejected hunks"
|
||||
BB.hPutBuilder stdout (pprHunks rej)
|
||||
else do
|
||||
when (not $ null rej) $ BB.writeFile rejfile (pprHunks rej)
|
||||
when (sus && not (null backupfile)) $
|
||||
B.readFile (patchInput o) >>= B.writeFile backupfile
|
||||
($ mconcat $ map (BB.byteString . snd) $ V.toList toks) $
|
||||
if outputStdout
|
||||
then BB.hPutBuilder stdout
|
||||
else BB.writeFile
|
||||
(if rewritingInput
|
||||
then patchInput o
|
||||
else patchOutput o)
|
||||
when sus $ exitWith (ExitFailure 1)
|
||||
|
||||
main' :: IO ()
|
||||
main' =
|
||||
let opts :: ParserInfo ADiffOptions
|
||||
opts =
|
||||
info
|
||||
(adiffOptions <**> versionOption "adiff" <**> helperOption)
|
||||
(fullDesc <>
|
||||
progDesc
|
||||
"Compare, patch and merge files on arbitrarily-tokenized sequences." <>
|
||||
"Compare, patch and merge files on arbitrarily tokenized sequences." <>
|
||||
header "adiff: arbitrary-token diff utilities")
|
||||
in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <-
|
||||
in do ADiffOptions {adiffTokOpts = topt, adiffCmdOpts = copt} <-
|
||||
customExecParser (prefs $ helpShowGlobals <> subparserInline) opts
|
||||
redfa <- redfaPrepareRules ropt
|
||||
case copt of
|
||||
CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
|
||||
[toks1, toks2] <- traverse (loadToks redfa) [f1, f2]
|
||||
BB.hPutBuilder stdout $
|
||||
pprHunks $ hunks (max 0 ctxt) $ diffToks toks1 toks2
|
||||
CmdPatch {} -> putStrLn "not supported yet"
|
||||
CmdDiff3 ctxt f1 f2 f3 mo -> do
|
||||
[toksMine, toksOld, toksYour] <-
|
||||
traverse (loadToks redfa) [f1, f2, f3]
|
||||
let d3 = diff3Toks mo toksMine toksOld toksYour
|
||||
BB.hPutBuilder stdout $
|
||||
if mergeDoMerge mo
|
||||
then fmtMerged mo d3
|
||||
else pprHunks $ hunks (max 0 ctxt) d3
|
||||
(case copt of
|
||||
CmdDiff {} -> doDiff
|
||||
CmdDiff3 {} -> doDiff3
|
||||
CmdPatch {} -> doPatch)
|
||||
topt
|
||||
copt
|
||||
|
||||
main :: IO ()
|
||||
main =
|
||||
main' `catch`
|
||||
(\e -> do
|
||||
let err = show (e :: IOException)
|
||||
hPutStrLn stderr err
|
||||
exitWith $ ExitFailure 2)
|
||||
|
|
|
@ -33,7 +33,7 @@ mergeOption forPatch =
|
|||
switch
|
||||
(short 'a' <>
|
||||
long "add-linebreak" <>
|
||||
help "Automatically add a line break after conflict markers") <*>
|
||||
help "Automatically add a line break after conflict markers. Useful with `lines' lexer.") <*>
|
||||
mo)
|
||||
where
|
||||
marker = fromString . replicate 7
|
||||
|
@ -44,7 +44,7 @@ mergeOption forPatch =
|
|||
long "merge" <>
|
||||
help
|
||||
(if forPatch
|
||||
then "Merge using conflict markers instead of printing the rejected hunks"
|
||||
then "Instead of printing the rejected thunks, merge using conflict markers as if the INPUT was `MYFILE' and the patch would produced `YOURFILE' from the original."
|
||||
else "Output the merged file instead of the patch")) <*>
|
||||
switch
|
||||
(short 'i' <>
|
||||
|
@ -69,11 +69,11 @@ mergeOption forPatch =
|
|||
help
|
||||
("On whitespace mismatch, output the version from " ++
|
||||
(if forPatch
|
||||
then "original file"
|
||||
then "the original file"
|
||||
else "MYFILE") ++
|
||||
" instead of the one from " ++
|
||||
(if forPatch
|
||||
then "patch"
|
||||
then "the context in patch"
|
||||
else "YOURFILE"))) <*>
|
||||
strOption
|
||||
(long "merge-start" <>
|
||||
|
|
32
src/Patch.hs
32
src/Patch.hs
|
@ -1,4 +1,30 @@
|
|||
module Patch where
|
||||
module Patch (patchToks, pprPatchWarn) where
|
||||
|
||||
patchToks :: a
|
||||
patchToks = undefined
|
||||
import Format
|
||||
import Types
|
||||
import Merge
|
||||
import qualified Data.ByteString as B
|
||||
|
||||
data PatchWarn = HunkMatched Hunk Int | HunkFailed Hunk
|
||||
|
||||
pprPatchWarn :: PatchWarn -> String
|
||||
pprPatchWarn (HunkMatched ((o,n), _) at) = "hunk (-"++ show o ++ " +"++show n++") succeeded at "++show at
|
||||
pprPatchWarn (HunkFailed ((o,n),_)) = "hunk (-"++ show o ++ " +"++show n++") FAILED"
|
||||
|
||||
patchToks :: TV -> [Hunk] -> Bool -> MergeOpts -> (TV, [Hunk], [PatchWarn])
|
||||
patchToks toks hunks rev mopt = undefined
|
||||
--let (tokss, hunk, sus) = patchHunksAt 0 0 0 (if rev then map revHunk hunks else hunks) mopt
|
||||
|
||||
|
||||
patchHunksAt :: TV -> Int -> Int -> Int -> [Hunk] -> MergeOpts -> ([TV], [PatchWarn])
|
||||
patchHunksAt toks tvoff origoff newoff hunks mopt = undefined
|
||||
|
||||
hunkToMatch :: Diff -> Diff
|
||||
hunkToMatch = filter $ (`elem` [Keep, Remove, Original]) . fst
|
||||
|
||||
hunkToReplace :: Diff -> Diff
|
||||
hunkToReplace = filter $ (`elem` [Keep, Add, MineChanged, YourChanged]) . fst
|
||||
|
||||
tokCmp :: Bool -> Tok -> Tok -> Bool
|
||||
tokCmp True (False, _) (False, _) = True -- ignore the whitespace change
|
||||
tokCmp _ a b = a == b -- otherwise just compare
|
||||
|
|
177
src/Redfa.hs
177
src/Redfa.hs
|
@ -1,177 +0,0 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
|
||||
module Redfa where
|
||||
|
||||
import Control.Monad
|
||||
import qualified Data.ByteString as B
|
||||
import qualified Data.ByteString.Builder as BB
|
||||
import qualified Data.ByteString.Char8 as B8
|
||||
import qualified Data.ByteString.Internal as BI
|
||||
import qualified Data.ByteString.Lazy as BL
|
||||
import Data.ByteString.UTF8 (fromString, toString)
|
||||
import Data.List
|
||||
import Data.Maybe
|
||||
import qualified Data.Vector as V
|
||||
import Options.Applicative
|
||||
import Substr
|
||||
import Text.Regex.TDFA
|
||||
import Types
|
||||
|
||||
data RedfaOption
|
||||
= RedfaOptionRules [BS]
|
||||
| RedfaOptionFile String
|
||||
deriving (Show)
|
||||
|
||||
data RedfaSpec =
|
||||
RedfaSpec
|
||||
{ redfaStart :: Int
|
||||
, redfaNames :: V.Vector BS
|
||||
, redfaRules :: V.Vector [RedfaRule] --start, mid, jump, token
|
||||
}
|
||||
|
||||
data RedfaRule =
|
||||
RedfaRule
|
||||
{ rrRegexStart :: Regex
|
||||
, rrRegexMid :: Regex
|
||||
, rrJump :: Int
|
||||
, rrIsToken :: Bool
|
||||
}
|
||||
|
||||
redfaOption :: Parser RedfaOption
|
||||
redfaOption =
|
||||
RedfaOptionRules <$>
|
||||
some
|
||||
(strOption $
|
||||
metavar "RULE" <>
|
||||
short 'L' <>
|
||||
long "lex" <> help "Lexing rule (specify repeatedly for more rules)") <|>
|
||||
RedfaOptionFile <$>
|
||||
strOption
|
||||
(metavar "FILE" <>
|
||||
short 'l' <>
|
||||
long "lexer" <> help "File from where to load the lexing rules")
|
||||
|
||||
redfaOptionToRuleStrings :: RedfaOption -> IO [BS]
|
||||
redfaOptionToRuleStrings (RedfaOptionRules x) = return x
|
||||
redfaOptionToRuleStrings (RedfaOptionFile fn) =
|
||||
B8.lines <$> B.readFile fn -- TODO improve
|
||||
|
||||
splitFirst :: Char -> BS -> (BS, BS)
|
||||
splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
|
||||
|
||||
redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
|
||||
redfaRuleStringToRuleStr s =
|
||||
let (spec, regex) = splitFirst ':' s
|
||||
(from, to) = splitFirst '>' spec
|
||||
sf = B8.strip from
|
||||
(cleanFrom, isToken)
|
||||
| B.null sf = (sf, True)
|
||||
| B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False)
|
||||
| otherwise = (sf, True)
|
||||
go
|
||||
| B.null s = Nothing
|
||||
| B.head s == 35 = Nothing -- # comment
|
||||
| B.null regex = Just (B.empty, B.empty, spec, isToken)
|
||||
| B.null to = Just (cleanFrom, cleanFrom, B.tail regex, isToken)
|
||||
| otherwise =
|
||||
Just (cleanFrom, B8.strip $ B.tail to, B.tail regex, isToken)
|
||||
in go
|
||||
|
||||
unescapeRegex :: MonadFail m => BS -> m BS
|
||||
unescapeRegex s' = BL.toStrict . BB.toLazyByteString <$> unescape' s'
|
||||
where
|
||||
unescape' :: MonadFail m => BS -> m BB.Builder
|
||||
unescape' s
|
||||
| B.null s = return mempty
|
||||
| B.head s == BI.c2w '\\' && B.null (B.tail s) =
|
||||
fail "incomplete escape sequence"
|
||||
| B.head s == BI.c2w '\\' =
|
||||
let rest = B.tail s
|
||||
cc = B.head rest
|
||||
thechar =
|
||||
BB.stringUtf8 $
|
||||
case BI.w2c cc of
|
||||
'`' -> "\\`"
|
||||
'\'' -> "\\'"
|
||||
'b' -> "\\b"
|
||||
'B' -> "\\B"
|
||||
'<' -> "\\<"
|
||||
'>' -> "\\>"
|
||||
'a' -> "\a"
|
||||
'e' -> "\x1b"
|
||||
'f' -> "\f"
|
||||
'n' -> "\n"
|
||||
'r' -> "\r"
|
||||
't' -> "\t"
|
||||
'\\' -> "\\\\"
|
||||
a -> [a] --TODO add support for \x and \u
|
||||
in (thechar <>) <$> unescape' (B.tail rest)
|
||||
| otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s)
|
||||
|
||||
redfaPrepareRules :: RedfaOption -> IO RedfaSpec
|
||||
redfaPrepareRules opt = do
|
||||
(states, jumps, regexes, isToken) <-
|
||||
unzip4 . mapMaybe redfaRuleStringToRuleStr <$> redfaOptionToRuleStrings opt
|
||||
uRegexes <- traverse unescapeRegex regexes
|
||||
startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
|
||||
midREs <- traverse (makeRegexM . (fromString "\\`(.|\n)" <>)) uRegexes
|
||||
let ids = nub . sort $ states ++ jumps
|
||||
index' = fromJust . flip elemIndex ids
|
||||
statesIds = map index' states
|
||||
jumpsIds = map index' jumps
|
||||
start = index' $ head states
|
||||
rules :: [[RedfaRule]]
|
||||
rules = do
|
||||
stateId <- [0 .. length ids - 1]
|
||||
return $ do
|
||||
(a, b, (rs, rm), t) <-
|
||||
zip4 statesIds jumpsIds (zip startREs midREs) isToken
|
||||
guard $ a == stateId
|
||||
return $ RedfaRule rs rm b t
|
||||
return $ RedfaSpec start (V.fromList ids) (V.fromList rules)
|
||||
|
||||
redfaTokenize :: MonadFail m => RedfaSpec -> BS -> m [Tok]
|
||||
redfaTokenize spec s = redfaTokenize' spec s (redfaStart spec) 0 []
|
||||
|
||||
redfaTokenize' ::
|
||||
MonadFail m => RedfaSpec -> BS -> Int -> Int -> [Int] -> m [Tok]
|
||||
redfaTokenize' spec s state off visited
|
||||
| off >= B.length s = pure []
|
||||
| otherwise =
|
||||
let (ooff, reg) =
|
||||
if off == 0
|
||||
then (0, rrRegexStart)
|
||||
else (1, rrRegexMid)
|
||||
matchString = B.drop (off - ooff) s
|
||||
matches :: [(RedfaRule, (MatchOffset, MatchLength))]
|
||||
matches =
|
||||
filter contOK $
|
||||
zip rules $ map (\x -> match (reg x) matchString) rules
|
||||
contOK (RedfaRule {rrJump = j}, (off', len))
|
||||
| off' /= 0 = False
|
||||
| len > ooff = True
|
||||
| otherwise = j `notElem` visited
|
||||
in case matches of
|
||||
[] ->
|
||||
fail $
|
||||
"Tokenization could not continue from " <>
|
||||
stateStrPretty <> " at offset " <> show (off + ooff)
|
||||
((rule, (_, len)):_) ->
|
||||
let matchLen = len - ooff
|
||||
in (if matchLen > 0
|
||||
then (:) (rrIsToken rule, substr off matchLen s)
|
||||
else id) <$>
|
||||
redfaTokenize'
|
||||
spec
|
||||
s
|
||||
(rrJump rule)
|
||||
(off + matchLen)
|
||||
(if matchLen > 0
|
||||
then []
|
||||
else state : visited)
|
||||
where
|
||||
rules = redfaRules spec V.! state
|
||||
stateStr = redfaNames spec V.! state
|
||||
stateStrPretty
|
||||
| B.null stateStr = "anonymous state"
|
||||
| otherwise = "state `" <> toString stateStr <> "'"
|
|
@ -1,7 +0,0 @@
|
|||
module Substr where
|
||||
|
||||
import qualified Data.ByteString as B
|
||||
import Types
|
||||
|
||||
substr :: Int -> Int -> BS -> BS
|
||||
substr b l = B.take l . B.drop b
|
114
src/Tokenizers.hs
Normal file
114
src/Tokenizers.hs
Normal file
|
@ -0,0 +1,114 @@
|
|||
{-# LANGUAGE Rank2Types #-}
|
||||
{-# LANGUAGE TupleSections #-}
|
||||
|
||||
module Tokenizers (TokOpts, tokOptions, tokenize) where
|
||||
|
||||
import qualified Data.ByteString as B
|
||||
import qualified Data.ByteString.UTF8 as BU8
|
||||
import qualified Data.Word8 as W8
|
||||
import Options.Applicative
|
||||
import Types
|
||||
import qualified Unicode.Char.General as UC
|
||||
|
||||
data TokOpts =
|
||||
TokOpts
|
||||
{ optTokenizerName :: String
|
||||
}
|
||||
deriving (Show)
|
||||
|
||||
tokOptions :: Parser TokOpts
|
||||
tokOptions =
|
||||
TokOpts <$>
|
||||
strOption
|
||||
(metavar "LEXER" <>
|
||||
short 'l' <>
|
||||
long "lexer" <>
|
||||
value "text" <>
|
||||
help "Lexer name (use --help-lexers to list available ones)")
|
||||
|
||||
--TODO this should later allow choosing the lexer by suffix
|
||||
tokenize :: MonadFail m => TokOpts -> FilePath -> BS -> m [Tok]
|
||||
tokenize topt _ =
|
||||
case filter ((== optTokenizerName topt) . tkName) tokenizers of
|
||||
[t] -> runTokenizer t
|
||||
_ -> const $ fail "Lexer not found"
|
||||
|
||||
data Tokenizer =
|
||||
Tokenizer
|
||||
{ tkName :: String
|
||||
, tkDescription :: String
|
||||
, runTokenizer :: forall m. MonadFail m =>
|
||||
BS -> m [Tok]
|
||||
}
|
||||
|
||||
tokenizers :: [Tokenizer]
|
||||
tokenizers =
|
||||
[ Tokenizer "lines" "works like the traditional diff" tokenizeLines
|
||||
, Tokenizer "asciiwords" "separate by ASCII whitespace" tokenizeWords8
|
||||
, Tokenizer "words" "separate on any UTF8 spacing" tokenizeWords
|
||||
, Tokenizer
|
||||
"text"
|
||||
"separate groups of similar-class UTF8 characters"
|
||||
tokenizeUnicode
|
||||
]
|
||||
|
||||
tokenizeLines :: MonadFail m => BS -> m [Tok]
|
||||
tokenizeLines = pure . map (True, ) . BU8.lines'
|
||||
|
||||
tokenizeWords8 :: MonadFail m => BS -> m [Tok]
|
||||
tokenizeWords8 =
|
||||
pure . map (\x -> (not $ W8.isSpace $ B.head x, x)) . bsGroupOn W8.isSpace
|
||||
where
|
||||
bsGroupOn f = B.groupBy (\l r -> f l == f r)
|
||||
|
||||
tokenizeWords :: MonadFail m => BS -> m [Tok]
|
||||
tokenizeWords = pure . map makeTokenBU8 . groupOnBU8 UC.isWhiteSpace
|
||||
|
||||
makeTokenBU8 :: BS -> Tok
|
||||
makeTokenBU8 s = if
|
||||
maybe False (not . UC.isWhiteSpace . fst) (BU8.decode s)
|
||||
then (True, s)
|
||||
else (False, s)
|
||||
|
||||
{- needed?
|
||||
foldri :: (Char -> Int -> a -> a) -> a -> BS -> a
|
||||
foldri cons nil cs =
|
||||
case decode cs of
|
||||
Just (a, l) -> cons a l (foldrS cons nil $ drop l cs)
|
||||
Nothing -> nil
|
||||
-}
|
||||
|
||||
spanBU8 :: (Char -> Bool) -> BS -> (BS, BS)
|
||||
spanBU8 f orig = B.splitAt (go 0 orig) orig
|
||||
where
|
||||
go :: Int -> BS -> Int
|
||||
go n bs =
|
||||
case BU8.decode bs of
|
||||
Just (a, l) ->
|
||||
if f a
|
||||
then go (n + l) (B.drop l bs)
|
||||
else n
|
||||
Nothing -> n
|
||||
|
||||
groupByBU8 :: (Char -> Char -> Bool) -> BS -> [BS]
|
||||
groupByBU8 f s =
|
||||
case BU8.decode s of
|
||||
Just (a, _) ->
|
||||
let (g, s') = spanBU8 (f a) s
|
||||
in g : groupByBU8 f s'
|
||||
Nothing -> []
|
||||
|
||||
groupOnBU8 :: Eq a => (Char -> a) -> BS -> [BS]
|
||||
groupOnBU8 f = groupByBU8 (\l r -> f l == f r)
|
||||
|
||||
tokenizeUnicode :: MonadFail m => BS -> m [Tok]
|
||||
tokenizeUnicode = pure . map makeTokenBU8 . groupOnBU8 simpleUnicodeCategory
|
||||
|
||||
simpleUnicodeCategory :: Char -> Int
|
||||
simpleUnicodeCategory c
|
||||
| UC.isWhiteSpace c = 1
|
||||
| UC.isAlphaNum c = 2
|
||||
| UC.isSymbol c = 3
|
||||
| UC.isPunctuation c = 4
|
||||
| UC.isSeparator c = 5
|
||||
| otherwise = 0
|
|
@ -1,3 +1,5 @@
|
|||
--{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Types where
|
||||
|
||||
import Data.ByteString
|
||||
|
|
Loading…
Reference in a new issue