Compare commits

...

10 commits

14 changed files with 934 additions and 357 deletions

170
README.md Normal file
View file

@ -0,0 +1,170 @@
# adiff (arbitrary-tokens diff, patch and merge)
This is a half-working pre-alpha version, use with care.
### Short summary
The main aim of this toolbox is to help with finding differences in text
formats that do not have a fixed "line-by-line" semantics, as assumed by
standard unix `diff` and related tools.
The problem was previously tackled by Arek Antoniewicz on MFF CUNI, who
produced a working software package in C++, and designed the Regex-edged DFAs
(REDFAs) that were used for user-specifiable tokenization of the input. The
work on the corresponding thesis is finished.
This started as a simple Haskell port of that work, and packed some relatively
orthogonal improvements (mainly the histogram-style diffing). I later got rid
of the REDFA concept -- while super-interesting and useful in theory, I didn't
find a sufficiently universal way to build good lexers from user-specified
strings. Having a proper Regex representation library (so that e.g.
reconstructing Flex is easy) would help a lot.
### TODO list
- Implement `patch` functionality, mainly patchfile parsing and fuzzy matching
of hunk context. `diff` and `diff3` works.
- Implement a splitting heuristic for diffs, so that diffing of large files
doesn't take aeons
- check if we can have external lexers, unix-style
# How-To
Install using `cabal`. The `adiff` program has 3 sub-commands that work like
`diff`, `patch` and `diff3`.
## Example
Let's have a file `orig`:
```
Roses are red. Violets are blue.
Patch is quite hard. I cannot rhyme.
```
and a modified file `mine`:
```
Roses are red. Violets are blue.
Patching is hard. I still cannot rhyme.
```
Let's use the `words` lexer, which marks everything whitespace-ish as
whitespace, and picks up groups of non-whitespace "content" characters.
Diffing the 2 files gets done as such:
```
$ cabal run adiff -- -l words diff orig mine
```
You should get something like this:
```
@@ -7 +7 @@
.
|are
.
|blue.
.\n
-|Patch
+|Patching
.
|is
-.
-|quite
.
|hard.
.
|I
+.
+|still
.
|cannot
.
|rhyme.
.\n
```
Let's pretend someone has sent us a new version, with a better formated verse
and some other improvements, in file `yours`:
```
Roses are red.
Violets are blue.
Patch is quite hard.
I cannot do verses.
```
We can run `diff3` to get a patch with both changes, optionally with reduced
context:
```
$ cabal run adiff -- -l words diff3 mine orig yours -C1
```
...which outputs:
```
@@ -4 +4 @@
|red.
-.
+.\n
|Violets
@@ -11 +11 @@
.\n
-|Patch
+|Patching
.
|is
-.
-|quite
.
|hard.
-.
+.\n
|I
+.
+|still
.
@@ -23 +23 @@
.
-|rhyme.
+|do
+.
+|verses.
.\n
```
...or get a merged output right away, using the `-m`/`--merge` option:
```
Roses are red.
Violets are blue.
Patching is hard.
I still cannot do verses.
```
...or completely ignore whatever whitespace changes that the people decided to
do for whatever reason, with `-i`/`--ignore-whitespace` (also works without
`-m`):
```
Roses are red. Violets are blue.
Patching is hard. I still cannot do verses.
```
If there's a conflict (substituing the `Patch` to `Merging` in file `yours`), it gets highlighted in the merged diff as such:
```
[...]
.
|blue.
.\n
<|Patching
=|Patch
>|Merging
.
|is
-.
-|quite
[...]
```
and using the standard conflict marks in the merged output:
```
Roses are red.
Violets are blue.
<<<<<<<Patching|||||||Patch=======Merging>>>>>>> is hard.
I still cannot do verses.
```

View file

@ -42,6 +42,8 @@ executable adiff
-- .hs or .lhs file containing the Main module.
main-is: Main.hs
ghc-options: -O2 -Wall
-- Modules included in this executable, other than Main.
other-modules: Diff,
Diff3,
@ -49,8 +51,7 @@ executable adiff
Hunks,
Merge,
Patch,
Redfa,
Substr,
Tokenizers,
Types,
Version
@ -58,13 +59,15 @@ executable adiff
other-extensions: CPP
-- Other library packages from which modules are imported.
build-depends: base ^>=4.13.0.0,
build-depends: base ^>=4.15.0.0,
attoparsec ^>=0.14,
extra ^>= 1.7,
mmap ^>=0.5,
regex-tdfa ^>= 1.3,
optparse-applicative ^>=0.16,
bytestring ^>= 0.10.12,
bytestring ^>= 0.11.2,
vector ^>=0.12,
word8 ^>=0.1,
unicode-data ^>=0.3,
utf8-string ^>=1.0
-- Directories containing source files.

View file

@ -4,18 +4,8 @@ module Diff
( diffToks
) where
import Control.Monad
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
import Data.ByteString.UTF8 (fromString)
import Data.Function (on)
import Data.List (groupBy, mapAccumL)
import Data.List.Extra (split, takeEnd)
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed.Mutable as M
import Substr
import Types
data DiffEnv =
@ -30,10 +20,11 @@ data DiffEnv =
, deB :: Int
, deVS :: V.Vector (Int, Int)
, deVE :: V.Vector (Int, Int)
, deTokPrio :: Tok -> Int
, deTrans :: Bool
}
deriving (Show)
toksMatch :: Int -> Int -> DiffEnv -> Bool
toksMatch x y DiffEnv {deT1 = t1, deT2 = t2} = t1 V.! x == t2 V.! y
stripEqToks :: TV -> TV -> (Diff, Diff, TV, TV)
@ -56,10 +47,19 @@ stripEqToks t1 t2 = (pre, post, t1', t2')
t1' = V.slice b (l1 - e - b) t1
t2' = V.slice b (l2 - e - b) t2
makePrios :: TV -> TV -> (Bool, BS) -> Int
makePrios _ _ = get
where
get (isToken, str) =
if isToken
then B.length str
else 0
diffToks :: TV -> TV -> Diff
diffToks t1' t2' = pre ++ res ++ post
where
(pre, post, t1, t2) = stripEqToks t1' t2'
stats = makePrios t1' t2'
res
| V.null t1 = map (Add, ) (V.toList t2)
| V.null t2 = map (Remove, ) (V.toList t1)
@ -76,6 +76,7 @@ diffToks t1' t2' = pre ++ res ++ post
, deB = V.length t2
, deVS = V.fromList $ zip [0 .. V.length t2] (repeat 0)
, deVE = V.fromList $ reverse $ zip [0 .. V.length t2] (repeat 0)
, deTokPrio = stats
, deTrans = False
}
| otherwise =
@ -91,9 +92,11 @@ diffToks t1' t2' = pre ++ res ++ post
, deB = V.length t1
, deVS = V.fromList $ zip [0 .. V.length t1] (repeat 0)
, deVE = V.fromList $ reverse $ zip [0 .. V.length t1] (repeat 0)
, deTokPrio = stats
, deTrans = True
}
minIndexFwd :: V.Vector (Int, Int) -> Int
minIndexFwd =
V.minIndexBy
(\x y ->
@ -102,6 +105,7 @@ minIndexFwd =
else GT --basically normal V.minIndex
)
minIndexRev :: V.Vector (Int, Int) -> Int
minIndexRev =
V.minIndexBy
(\x y ->
@ -117,12 +121,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
mid = quot (s + e) 2
vecSmid = vecS mid
vecEmid = vecE mid
extraScore i =
if isToken
then -(B.length s)
else 0
where
(isToken, s) = deT1 de V.! i
prio i = negate . deTokPrio de $ deT1 de V.! i
vecS = vec -- "forward" operation
where
vec i
@ -138,7 +137,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
(iupleft, supleft) = v V.! pred j
keep
| toksMatch (pred i) (pred j) de =
min (iupleft, supleft + extraScore (pred i))
min (iupleft, supleft + prio (pred i))
| otherwise = id
res = keep $ min (succ iup, sup) (succ ileft, sleft)
in res : go (succ j) res
@ -157,7 +156,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
(idownright, sdownright) = v V.! succ j
keep
| toksMatch i j de =
min (idownright, sdownright + extraScore i)
min (idownright, sdownright + prio i)
| otherwise = id
res = keep $ min (succ idown, sdown) (succ iright, sright)
in res : go (pred j) res
@ -220,7 +219,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
, fst (vecLS V.! i) == fst sCost - a + i
, sumL V.! i == totalCost
, if doKeep
then scoreAdd (vecLS V.! i) (0, extraScore s) ==
then scoreAdd (vecLS V.! i) (0, prio s) ==
vecRS V.! succ i
else scoreAdd (vecLS V.! i) (1, 0) == vecRS V.! i
, if doKeep
@ -230,7 +229,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
]
jumpEnd =
if doKeep
then jumpPos + 1
then succ jumpPos
else jumpPos
in map
(\i ->

View file

@ -1,56 +1,47 @@
{-# LANGUAGE TupleSections #-}
module Diff3 where
module Diff3
( diff3Toks
) where
import Diff
import Types
import Merge
import Types
data Origin
= Stable
| Mine
| Your
deriving (Show, Eq)
stable :: (Origin, a) -> Bool
stable (Stable, _) = True
stable _ = False
applySplit :: (a -> Bool) -> ([a] -> [b]) -> ([a] -> [b]) -> [a] -> [b]
applySplit cond f g xs = f (takeWhile cond xs) ++ g (dropWhile cond xs)
conflict :: MergeOpts -> [(Origin, (Op, Tok))] -> [(Op, Tok)]
conflict mo = go
where
go [] = []
go as@(a:_)
| stable a = applySplit stable (map snd) go as
| otherwise = applySplit (not . stable) (merge mo) go as
align :: Diff -> Diff -> [(Origin, (Op, Tok))]
align ((Keep, a):as) ((Keep, _):bs) = (Stable, (Keep, a)) : align as bs
align ((Add, a):as) ((Add, b):bs) =
(Mine, (Add, a)) : (Your, (Add, b)) : align as bs
align ((Remove, a):as) ((Remove, b):bs) =
(Mine, (Remove, a)) : (Your, (Remove, b)) : align as bs
align ((Add, a):as) bs@((Keep, _):_) = (Mine, (Add, a)) : align as bs
align as@((Keep, _):_) ((Add, b):bs) = (Your, (Add, b)) : align as bs
align ((Remove, a):as) ((Keep, b):bs) =
(Mine, (Remove, a)) : (Your, (Keep, b)) : align as bs
align ((Keep, a):as) ((Remove, b):bs) =
(Mine, (Keep, a)) : (Your, (Remove, b)) : align as bs
align ((Add, a):as) bs@((Remove, _):_) = (Mine, (Add, a)) : align as bs
align as@((Remove, _):_) ((Add, b):bs) = (Your, (Add, b)) : align as bs
align [] [] = []
align as@((Add, _):_) [] = map (Mine, ) as
align [] bs@((Add, _):_) = map (Your, ) bs
align _ _ = error "Internal failure: diffstreams seem broken, cannot align"
diff3Toks :: MergeOpts -> TV -> TV -> TV -> Diff
diff3Toks mo tMine tOrig tYour =
conflict $ align (diffToks tOrig tMine) (diffToks tOrig tYour)
where
align :: Diff -> Diff -> [(Origin, (Op, Tok))]
align ((Keep, a):as) ((Keep, _):bs) = (Stable, (Keep, a)) : align as bs
align ((Add, a):as) ((Add, b):bs) =
(Mine, (Add, a)) : (Your, (Add, b)) : align as bs
align ((Remove, a):as) ((Remove, b):bs) =
(Mine, (Remove, a)) : (Your, (Remove, b)) : align as bs
align ((Add, a):as) bs@((Keep, _):_) = (Mine, (Add, a)) : align as bs
align as@((Keep, _):_) ((Add, b):bs) = (Your, (Add, b)) : align as bs
align ((Remove, a):as) ((Keep, b):bs) =
(Mine, (Remove, a)) : (Your, (Keep, b)) : align as bs
align ((Keep, a):as) ((Remove, b):bs) =
(Mine, (Keep, a)) : (Your, (Remove, b)) : align as bs
align ((Add, a):as) bs@((Remove, _):_) = (Mine, (Add, a)) : align as bs
align as@((Remove, _):_) ((Add, b):bs) = (Your, (Add, b)) : align as bs
align [] [] = []
align as@((Add, _):_) [] = map (Mine,) as
align [] bs@((Add, _):_) = map (Your,) bs
align _ _ = error "Internal failure: diffstreams seem broken, cannot align"
conflict :: [(Origin, (Op, Tok))] -> Diff
conflict [] = []
conflict as@(a:_)
| stable a = applySplit stable (map snd) conflict as
| unstable a = applySplit unstable merge conflict as
applySplit :: (a -> Bool) -> ([a] -> [b]) -> ([a] -> [b]) -> [a] -> [b]
applySplit cond f g xs = f (takeWhile cond xs) ++ g (dropWhile cond xs)
merge :: [(Origin, (Op,Tok))] -> Diff
merge cs =
let mys = map (\a -> map snd $ filter ((== a) . fst) cs) [Mine, Your]
[tokOrigsMine, tokOrigsYour] =
map (map snd.filter ((/= Add) . fst)) mys
[tokMine, tokYour] = map (map snd.filter ((/= Remove) . fst)) mys
in if tokOrigsMine /= tokOrigsYour
then error "Internal failure: merge origins differ"
else map (MineChanged,) tokMine ++
map (Original,) tokOrigsMine ++
map (YourChanged,) tokYour
stable (Stable, _) = True
stable _ = False
unstable = not . stable
conflict mo $ align (diffToks tOrig tMine) (diffToks tOrig tYour)

View file

@ -1,43 +1,128 @@
{-# LANGUAGE OverloadedStrings #-}
module Format where
module Format
( pprHunks
, pprHunk
, parsePatch
) where
import Types
import Control.Applicative
import Control.Monad
import Data.Attoparsec.ByteString as A
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
import Data.String
import Substr
import Data.Word8 as W8
pprHunkHdr i j = fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@"
backslash :: Word8
backslash = BI.c2w '\\'
lineSep = fromString "\n"
newline :: Word8
newline = BI.c2w '\n'
pprHunks :: [Hunk] -> BB.Builder
pprHunks = mconcat . map pprHunk
lineSep :: BB.Builder
lineSep = BB.word8 newline
pprHunkHdr :: Int -> Int -> BB.Builder
pprHunkHdr i j =
(fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@") <> lineSep
pprHunk :: Hunk -> BB.Builder
pprHunk ((i, j), toks) = mconcat ((pprHunkHdr i j <> lineSep) : map pprDiff1 toks)
pprHunk ((i, j), toks) = mconcat (pprHunkHdr i j : map pprDiff1 toks)
pprDiff1 :: (Op, Tok) -> BB.Builder
pprDiff1 (op, (tok, s)) =
fromString pfx <> escNewlines s <> lineSep
pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
where
pfx = [opc,tc]
opc = case op of
Add -> '+'
Keep -> ' '
Remove -> '-'
MineChanged -> '<'
Original -> '='
YourChanged -> '>'
tc = if tok then '|' else '.'
pfx = [opc, tc]
opc =
case op of
Add -> '+'
Keep -> ' '
Remove -> '-'
MineChanged -> '<'
Original -> '='
YourChanged -> '>'
tc =
if tok
then '|'
else '.'
escNewlines :: BS -> BB.Builder
escNewlines s
| B.null s = mempty
| B.head s == BI.c2w '\n' = BB.stringUtf8 "\\n" <> escNewlines (B.tail s)
| B.head s == BI.c2w '\\' = BB.stringUtf8 "\\\\" <> escNewlines (B.tail s)
| B.head s == newline =
BB.word8 backslash <> BB.word8 (BI.c2w 'n') <> escNewlines (B.tail s)
| B.head s == backslash =
BB.word8 backslash <> BB.word8 backslash <> escNewlines (B.tail s)
| otherwise = BB.word8 (B.head s) <> escNewlines (B.tail s)
--parsePatch :: BS -> Either String [Hunk]
parsePatch = parseOnly parseHunks
parseHunks :: Parser [Hunk]
parseHunks = many parseHunk <* endOfInput
parseHunk :: Parser Hunk
parseHunk = liftA2 (,) parseHunkHdr (many parseDiff1)
parseInt :: Parser Int
parseInt = read . map BI.w2c <$> some (satisfy W8.isDigit)
eol :: Parser ()
eol = void $ word8 newline
parseHunkHdr :: Parser (Int, Int)
parseHunkHdr = do
void . string $ fromString "@@ -"
i <- parseInt
void . string $ fromString " +"
j <- parseInt
void . string $ fromString " @@"
eol
return (i, j)
pairs2parsers :: [(a, Char)] -> [Parser a]
pairs2parsers = map (\(x, ch) -> x <$ word8 (BI.c2w ch))
parseOpList :: [Parser Op]
parseOpList =
pairs2parsers
[ (Add, '+')
, (Keep, ' ')
, (Remove, '-')
, (MineChanged, '<')
, (Original, '=')
, (YourChanged, '>')
]
parseOp :: Parser Op
parseOp = choice parseOpList
parseTokMarkList :: [Parser Bool]
parseTokMarkList = pairs2parsers [(True, '|'), (False, '.')]
parseTokMark :: Parser Bool
parseTokMark = choice parseTokMarkList
parseTokBS :: Parser BS
parseTokBS =
(BL.toStrict . BB.toLazyByteString . mconcat <$> many parseTokChar) <* eol
parseTokChar :: Parser BB.Builder
parseTokChar =
choice
[ BB.word8 newline <$ string (fromString "\\n")
, BB.word8 backslash <$ string (fromString "\\\\")
, BB.word8 <$> satisfy (\w -> w /= backslash && w /= newline)
]
parseTok :: Parser Tok
parseTok = liftA2 (,) parseTokMark parseTokBS
parseDiff1 :: Parser (Op, Tok)
parseDiff1 = liftA2 (,) parseOp parseTok

View file

@ -21,7 +21,7 @@ hunks ctxt d =
zipWith (<=) (padEnd ctxt edits) (replicate (1 + 2 * ctxt) 0 ++ edits)
addNums = snd . mapAccumL countTok (0, 0)
stripNums = (,) <$> fst . head <*> map snd
countTok x@(i, j) d@(op, _) =
countTok x@(i, j) d'@(op, _) =
(,)
(case op of
Remove -> (i + 1, j)
@ -30,4 +30,4 @@ hunks ctxt d =
MineChanged -> (i, j)
Original -> (i + 1, j + 1)
YourChanged -> (i, j))
(x, d)
(x, d')

View file

@ -1,9 +1,10 @@
module Main where
import Control.Exception
import Control.Monad
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Char8 as B8
import Data.ByteString.UTF8 (fromString)
import Data.Foldable (traverse_)
import qualified Data.Vector as V
import Diff
import Diff3
@ -11,14 +12,17 @@ import Format
import Hunks
import Merge
import Options.Applicative
import Redfa
import System.IO (stdout)
import Patch
import System.Exit
import System.IO (hPutStrLn, stderr, stdout)
import System.IO.MMap
import Tokenizers
import Types
import Version
data ADiffOptions =
ADiffOptions
{ adiffRedfaOpt :: RedfaOption
{ adiffTokOpts :: TokOpts
, adiffCmdOpts :: ADiffCommandOpts
}
deriving (Show)
@ -31,11 +35,17 @@ data ADiffCommandOpts
}
| CmdPatch
{ patchDryRun :: Bool
, patchInDir :: Maybe String
, patchInput :: String
--, patchInDir :: Maybe String
--, patchPathStrip :: Int
, patchInputPatch :: String
, patchOutput :: String
, patchReject :: String --todo convert to Maybes with optional
, patchBackup :: String
, patchReverse :: Bool
, patchPathStrip :: Int
, patchScanRange :: Int
, context :: Int
, patchMergeOpts :: MergeOpts
, patchInput :: String
}
| CmdDiff3
{ context :: Int
@ -46,50 +56,90 @@ data ADiffCommandOpts
}
deriving (Show)
contextOpt =
contextOpt :: Bool -> Parser Int
contextOpt forPatch =
check <$>
option
auto
(metavar "CONTEXT" <>
short 'C' <>
long "context" <>
value 5 <> help "How many tokens around the change to include in the patch")
value
(if forPatch
then 4
else 8) <>
help
(if forPatch
then "Maximum number of context tokens that may be discarded from the beginning and end of the hunk when attempting to find a match"
else "How many tokens around the change to include in the patch"))
where
check c
| c < 0 = error "Negative context"
| otherwise = c
diffCmdOptions :: Parser ADiffCommandOpts
diffCmdOptions =
CmdDiff <$> contextOpt <*> strArgument (metavar "FROMFILE") <*>
CmdDiff <$> contextOpt False <*> strArgument (metavar "FROMFILE") <*>
strArgument (metavar "TOFILE")
patchCmdOptions :: Parser ADiffCommandOpts
patchCmdOptions =
CmdPatch <$>
switch
(short 'n' <>
long "dry-run" <>
help "Do not patch anything, just print what would happen") <*>
optional
(strOption $
short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
-- optional (strOption $ short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
-- option auto (short 'p' <> long "strip" <> metavar "NUM" <> help "Strip NUM leading components from the paths" <> value 0) <*>
strOption
(short 'i' <>
long "input" <>
metavar "INPUT" <>
help "Read the patchfile from INPUT, defaults to `-' for STDIN" <>
metavar "PATCHFILE" <>
help "Read the patchfile from PATCHFILE, defaults to `-' for STDIN" <>
value "-") <*>
strOption
(short 'o' <>
long "output" <>
metavar "OUTPUT" <>
help
"Write the patched file to OUTPUT, use `-' for STDOUT. By default, INPUT is rewritten." <>
value "") <*>
strOption
(short 'r' <>
long "reject" <>
metavar "REJECTS" <>
help
"Write the rejected hunks file to file REJECTS, instead of default `OUTPUT.rej'. Use `-' to discard rejects." <>
value "") <*>
strOption
(short 'b' <>
long "backup" <>
metavar "BACKUP" <>
help
"When rewriting INPUT after a partially applied or otherwise suspicious patch, back up the original file in BACKUP instead of default `INPUT.orig'. Use `-' to discard backups." <>
value "") <*>
switch (short 'R' <> long "reverse" <> help "Unapply applied patches") <*>
option
auto
(short 'p' <>
long "strip" <>
metavar "NUM" <>
help "Strip NUM leading components from the paths" <> value 0) <*>
mergeOption True
(short 'S' <>
long "scan-range" <>
metavar "RANGE" <>
help
"Maximum distance from the indended patch position (in tokens) for fuzzy matching of hunks" <>
value 42) <*>
contextOpt True <*>
mergeOption True <*>
strArgument (metavar "INPUT")
diff3CmdOptions :: Parser ADiffCommandOpts
diff3CmdOptions =
CmdDiff3 <$> contextOpt <*> strArgument (metavar "MYFILE") <*>
CmdDiff3 <$> contextOpt False <*> strArgument (metavar "MYFILE") <*>
strArgument (metavar "OLDFILE") <*>
strArgument (metavar "YOURFILE") <*>
mergeOption False
actionOption :: Parser ADiffCommandOpts
actionOption =
actionOptions :: Parser ADiffCommandOpts
actionOptions =
hsubparser $
mconcat
[ command "diff" $ info diffCmdOptions $ progDesc "Compare two files"
@ -98,35 +148,138 @@ actionOption =
info diff3CmdOptions $ progDesc "Compare and merge three files"
]
adiffOptions = ADiffOptions <$> redfaOption <*> actionOption
adiffOptions :: Parser ADiffOptions
adiffOptions = ADiffOptions <$> tokOptions <*> actionOptions
loadToks redfa f =
V.fromList <$> (mmapFileByteString f Nothing >>= redfaTokenize redfa)
-- TODO: load in case it's not a regular file
loadToksMM :: TokOpts -> FilePath -> IO TV
loadToksMM topt fn = loadToksWith topt fn (mmapFileByteString fn Nothing)
main :: IO ()
main =
loadToksR :: TokOpts -> FilePath -> IO TV
loadToksR topt fn = loadToksWith topt fn (B.readFile fn)
loadToksWith :: TokOpts -> FilePath -> IO BS -> IO TV
loadToksWith topt fn bs = V.fromList <$> (bs >>= tokenize topt fn)
doDiff :: TokOpts -> ADiffCommandOpts -> IO ()
doDiff topt (CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt}) = do
[toks1, toks2] <- traverse (loadToksMM topt) [f1, f2]
let output = hunks (max 0 ctxt) $ diffToks toks1 toks2
BB.hPutBuilder stdout $ pprHunks output
unless (null output) $ exitWith (ExitFailure 1)
doDiff _ _ = error "dispatch failure"
doDiff3 :: TokOpts -> ADiffCommandOpts -> IO ()
doDiff3 topt (CmdDiff3 ctxt f1 f2 f3 mo) = do
[toksMine, toksOld, toksYour] <- traverse (loadToksMM topt) [f1, f2, f3]
let d3 = diff3Toks mo toksMine toksOld toksYour
isConflict (MineChanged, _) = True
isConflict (YourChanged, _) = True
isConflict _ = False
hasConflict = any isConflict d3
BB.hPutBuilder stdout $
if mergeDoMerge mo
then do
fmtMerged mo d3
else pprHunks $ hunks (max 0 ctxt) d3
when hasConflict $ exitWith (ExitFailure 1)
doDiff3 _ _ = error "dispatch failure"
note :: String -> IO ()
note = hPutStrLn stderr
doPatch :: TokOpts -> ADiffCommandOpts -> IO ()
doPatch topt o = do
toksIn <- loadToksR topt (patchInput o)
hs' <-
parsePatch <$>
case (patchInputPatch o) of
"-" -> B.getContents
fn -> B.readFile fn
hs <-
case hs' of
Left _ -> ioError $ userError "PATCHFILE parsing failed"
Right x -> pure x
let (toks, rej, warns) =
patchToks
toksIn
hs
(patchReverse o)
(patchScanRange o)
(context o)
(patchMergeOpts o)
sus = not (null warns)
dry = patchDryRun o
rewritingInput = null (patchOutput o)
outputStdout = patchOutput o == "-"
rejFile
| rewritingInput || outputStdout = patchInput o ++ ".rej"
| otherwise = patchOutput o ++ ".rej"
backupFile
| patchBackup o == "-" = ""
| patchBackup o == "" && rewritingInput = patchInput o ++ ".orig"
| otherwise = patchBackup o
outFile
| rewritingInput = patchInput o
| otherwise = patchOutput o
traverse_ (note . pprPatchWarn) warns
when dry $
note $
(if not sus
then "OK"
else "Possibly problematic") ++
" patch with " ++ show (length rej :: Int) ++ " rejected hunks"
when (not (null rej)) $
if dry
then note $ "Would write rejected hunks to " ++ rejFile
else do
note $ "Writing rejected hunks to " ++ rejFile
BB.writeFile rejFile (pprHunks rej)
when (sus && not (null backupFile)) $
if dry
then note $ "Would write backup to " ++ backupFile
else do
note $ "Writing backup to " ++ backupFile
B.readFile (patchInput o) >>= B.writeFile backupFile
let doWrite output =
if outputStdout
then if dry
then note "Would write output to stdout"
else BB.hPutBuilder stdout output
else if dry
then note $ "Would write output to " ++ outFile
else do
note $ "Writing output to " ++ outFile
BB.writeFile outFile output
doWrite (mconcat . map (BB.byteString . snd) . V.toList $ toks)
when (dry && not (null rej)) $ do
note "Rejected hunks:"
BB.hPutBuilder stdout (pprHunks rej)
when sus $ exitWith (ExitFailure 1)
main' :: IO ()
main' =
let opts :: ParserInfo ADiffOptions
opts =
info
(adiffOptions <**> versionOption "adiff" <**> helperOption)
(fullDesc <>
progDesc
"Compare, patch and merge files on arbitrarily-tokenized sequences." <>
"Compare, patch and merge files on arbitrarily tokenized sequences." <>
header "adiff: arbitrary-token diff utilities")
in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <-
execParser opts
redfa <- redfaPrepareRules ropt
case copt of
CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
[toks1, toks2] <- traverse (loadToks redfa) [f1, f2]
BB.hPutBuilder stdout $
pprHunks $ hunks (max 0 ctxt) $ diffToks toks1 toks2
CmdPatch {} -> putStrLn "not supported yet"
CmdDiff3 ctxt f1 f2 f3 mo -> do
[toksMine, toksOld, toksYour] <-
traverse (loadToks redfa) [f1, f2, f3]
let d3 = diff3Toks mo toksMine toksOld toksYour
BB.hPutBuilder stdout $
if mergeDoMerge mo
then fmtMerged mo d3
else pprHunks $ hunks (max 0 ctxt) d3
in do ADiffOptions {adiffTokOpts = topt, adiffCmdOpts = copt} <-
customExecParser (prefs $ helpShowGlobals <> subparserInline) opts
(case copt of
CmdDiff {} -> doDiff
CmdDiff3 {} -> doDiff3
CmdPatch {} -> doPatch)
topt
copt
main :: IO ()
main =
main' `catch`
(\e -> do
let err = show (e :: IOException)
note err
exitWith $ ExitFailure 2)

View file

@ -1,12 +1,13 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
module Merge
( MergeOpts(..)
, mergeOption
, fmtMerged
, merge
) where
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as BB
import Data.String
import Options.Applicative
@ -15,6 +16,7 @@ import Types
data MergeOpts =
MergeOpts
{ mergeDoMerge :: Bool
, mergeIgnoreWhitespace :: Bool
, mergeForceWhitespace :: Bool
, mergeKeepWhitespace :: Bool
, mergeCStartStr :: BS
@ -24,17 +26,17 @@ data MergeOpts =
}
deriving (Show)
marker = fromString . replicate 7
mergeOption :: Bool -> Parser MergeOpts
mergeOption forPatch =
addLBR <$>
((,) <$>
switch
(short 'a' <>
long "add-linebreak" <>
help "Automatically add a line break after conflict markers") <*>
help "Automatically add a line break after conflict markers. Useful with `lines' lexer.") <*>
mo)
where
marker = fromString . replicate 7
mo =
MergeOpts <$>
switch
@ -42,27 +44,36 @@ mergeOption forPatch =
long "merge" <>
help
(if forPatch
then "Merge using conflict markers instead of printing the rejected hunks"
then "Instead of printing the rejected thunks, merge using conflict markers as if the INPUT was `MYFILE' and the patch would produced `YOURFILE' from the original."
else "Output the merged file instead of the patch")) <*>
switch
(short 'w' <>
long "whitespace" <>
(short 'I' <>
long "ignore-whitespace" <>
help
("Ignore " ++
(if forPatch
then "hunks"
else "chunks") ++
" that change only whitespace")) <*>
switch
(short 'F' <>
long "force-whitespace" <>
help
((if forPatch
then "Force rejecting a thunk"
then "Force rejecting a hunk"
else "Force a merge conflict") ++
" on whitespace mismatch")) <*>
" on whitespace mismatch (overrides `ignore-whitespace')")) <*>
switch
(short 'k' <>
(short 'K' <>
long "keep-whitespace" <>
help
("On whitespace mismatch, default to the version from " ++
("On whitespace mismatch, output the version from " ++
(if forPatch
then "original file"
then "the original file"
else "MYFILE") ++
" instead of the one from " ++
(if forPatch
then "patch"
then "the context in patch"
else "YOURFILE"))) <*>
strOption
(long "merge-start" <>
@ -89,24 +100,27 @@ mergeOption forPatch =
, mergeCEndStr = mergeCEndStr x <> "\n"
}
{- This kinda relies on reasonable ordering within the conflicts in the Diff -}
{- This kinda relies on reasonable ordering
- within the conflicts in the Diff -}
fmtMerged :: MergeOpts -> Diff -> BB.Builder
fmtMerged mo = go Keep
where
go op []
| conflictOp op = bb $ mergeCEndStr mo
| otherwise = mempty
go last l@((op, (_, tok)):xs)
| conflictOp last && not (conflictOp op) =
go prev l@((op, (_, tok)):xs)
| conflictOp prev && not (conflictOp op) =
bb (mergeCEndStr mo) <> go Keep l
| not (conflictOp last) && conflictOp op =
| not (conflictOp prev) && conflictOp op =
bb (mergeCStartStr mo) <> go MineChanged l
| last /= op && conflictOp op =
| prev /= op && conflictOp op =
(case op of
MineChanged -> bb $ mergeCStartStr mo
Original -> bb $ mergeMineSepStr mo
YourChanged -> bb $ mergeYourSepStr mo) <>
YourChanged -> bb $ mergeYourSepStr mo
_ -> error "Internal conflict handling failure") <>
go op l
| op == Remove = go op xs
| otherwise = bb tok <> go op xs
conflictOp o =
case o of
@ -115,3 +129,27 @@ fmtMerged mo = go Keep
Remove -> False
_ -> True
bb = BB.byteString
merge :: MergeOpts -> [(Origin, (Op, Tok))] -> Diff
merge mo cs = go
where
mys@[diffMine, diffYour] =
map (\a -> map snd $ filter ((a ==) . fst) cs) [Mine, Your]
[tokOrigsMine, tokOrigsYour] = map (map snd . filter ((Add /=) . fst)) mys
[tokMine, tokYour] = map (map snd . filter ((Remove /=) . fst)) mys
conflict =
map (MineChanged, ) tokMine ++
map (Original, ) tokOrigsMine ++ map (YourChanged, ) tokYour
noTokens = all (not . fst . snd) (diffMine ++ diffYour)
go
| tokOrigsMine /= tokOrigsYour =
error "Internal failure: merge origins differ"
| mergeIgnoreWhitespace mo && noTokens = map (Keep, ) tokOrigsMine
| all ((Keep ==) . fst) diffYour = diffMine -- only mine changed
| all ((Keep ==) . fst) diffMine = diffYour -- only your changed
| diffMine == diffYour = diffMine -- false conflict
| not (mergeForceWhitespace mo) && noTokens =
if mergeKeepWhitespace mo
then diffMine
else diffYour -- conflict happened, but not on significant tokens
| otherwise = conflict -- true conflict

View file

@ -1,4 +1,210 @@
module Patch where
module Patch
( patchToks
, pprPatchWarn
) where
patchToks :: a
patchToks = undefined
import Data.Maybe (catMaybes)
import qualified Data.Vector as V
import Merge
import Types
data PatchWarn
= HunkMatched Hunk (Int, Int)
| HunkFailed Hunk (Int, Int)
deriving (Show)
data PatchState =
PatchState
{ input :: TV
, output :: [TV]
, warns :: [PatchWarn]
, inOff :: Int -- where we are in `input`
, patchInOff :: Int -- to what position does that correspond in patch input (gets adjusted on fuzzy matches)
, outOff :: Int -- where we are in output tokens (informative)
, patchOutOff :: Int -- to what position does that correspond in patch output (gets adjusted from processed hunks)
}
pprPatchWarn :: PatchWarn -> String
pprPatchWarn (HunkMatched (offs, _) poffs) =
"hunk (" ++ pprOffs offs ++ ") succeeded at " ++ pprOffs poffs
pprPatchWarn (HunkFailed (offs, _) poffs) =
"hunk (" ++ pprOffs offs ++ ") FAILED, expected at " ++ pprOffs poffs
pprOffs :: (Int, Int) -> String
pprOffs (o, n) = "-" ++ show o ++ " +" ++ show n
patchToks ::
TV
-> [Hunk]
-> Bool
-> Int
-> Int
-> MergeOpts
-> (TV, [Hunk], [PatchWarn])
patchToks toks hunks' revPatch scan ctxt mopt =
go hunks $ PatchState toks [] [] 0 0 0 0
where
hunks
| revPatch = map revHunk hunks'
| otherwise = hunks'
revHunk ((o, n), diff) = ((n, o), map revDiff diff)
revDiff (Add, t) = (Remove, t)
revDiff (Remove, t) = (Add, t)
revDiff (Keep, t) = (Keep, t)
revDiff _ = error "cannot reverse conflict diff"
go [] ps =
( V.concat (output ps ++ [V.drop (inOff ps) (input ps)])
, [rej | HunkFailed rej _ <- warns ps]
, warns ps)
go (h:hs) ps = go hs ps'
where
((fromPos, toPos), diff) = h
advance = fromPos - patchInOff ps
noMatch =
ps
{ warns =
warns ps ++
[HunkFailed h (advance + inOff ps, advance + outOff ps)]
, patchOutOff = patchOutOff ps - diffOffChange diff
}
cleanMatch :: Maybe PatchState
cleanMatch = patchHunkClean ps h mopt
isContext :: (Op, Tok) -> Bool
isContext (op, _) = op == Keep
discardedContextDiffs :: [(Int, Diff)]
discardedContextDiffs =
let (fwdCtxt, d') = span isContext diff
(revCtxt, revMid) = span isContext (reverse d')
mid = reverse revMid
discards n c@(_:r) = (n, c) : discards (n + 1) r
discards n [] = (n, []) : discards n []
in zipWith
(\(dfwd, fwd) (drev, rev) ->
(max dfwd drev, fwd ++ mid ++ reverse rev))
(discards 0 fwdCtxt)
(discards 0 revCtxt)
fuzzyHunks :: [Hunk]
fuzzyHunks = do
(discarded, ddiff) <- take (ctxt + 1) discardedContextDiffs
off <- 0 : concatMap (\x -> [-x, x]) [1 .. scan]
pure ((fromPos + discarded + off, toPos + discarded), ddiff)
fuzzyMatches =
[ (\x -> x {warns = warns x ++ [HunkMatched h fPos]}) <$>
patchHunkClean ps fh mopt
| fh@(fPos, _) <- tail fuzzyHunks -- tail omits the "clean" 0,0 one
]
ps' = head $ catMaybes (cleanMatch : fuzzyMatches) ++ [noMatch]
patchHunkClean :: PatchState -> Hunk -> MergeOpts -> Maybe PatchState
patchHunkClean ps ((fromPos, toPos), diff) mopts
| expInOff < 0 || expOutOff < 0 = Nothing
| mergeIgnoreWhitespace mopts && whitespaceOnly diff = Just ps
| Just repl <- matchDiff mopts (V.toList origPart) diff =
Just
ps
{ output = output ps ++ [skipped, V.fromList repl]
, inOff = expInOff + matchLen
, patchInOff = fromPos + matchLen
, outOff = expOutOff + replLen
, patchOutOff = toPos + replLen
}
| otherwise = Nothing
where
matchLen = diffMatchLen diff
replLen = diffReplLen diff
advance = fromPos - patchInOff ps
expInOff = advance + inOff ps
expOutOff = advance + outOff ps
skipped = V.take expInOff $ input ps
origPart = V.take matchLen $ V.drop expInOff $ input ps
whitespaceOnly :: Diff -> Bool
whitespaceOnly = all wsOnly
where
wsOnly (Keep, _) = True
wsOnly (Original, _) = True
wsOnly (_, (False, _)) = True
wsOnly _ = False
diffMatchLen :: Diff -> Int
diffMatchLen = sum . map (off . fst)
where
off Keep = 1
off Remove = 1
off Original = 1
off _ = 0
diffReplLen :: Diff -> Int
diffReplLen = sum . map (off . fst)
where
off Keep = 1
off Add = 1
off Original = 1
off _ = 0 -- tricky: the conflicts do not actually add to the diff counters
diffOffChange :: Diff -> Int
diffOffChange = sum . map (off . fst)
where
off Add = 1
off Remove = -1
off _ = 0
markToks :: MergeOpts -> Op -> Op -> [Tok]
markToks mopts x' y' = map (\s -> (True, s)) $ go x' y'
where
unmarked :: Op -> Bool
unmarked Keep = True
unmarked Add = True
unmarked Remove = True
unmarked _ = False
go :: Op -> Op -> [BS]
go x y
| x == y = []
| unmarked x = goUnmarked y
| x == MineChanged
, y /= Original = mergeYourSepStr mopts : go YourChanged y
| x == MineChanged = mergeMineSepStr mopts : go Original y
| x == Original = mergeYourSepStr mopts : go YourChanged y
| x == YourChanged = mergeCEndStr mopts : goUnmarked y
| otherwise = error "internal error in markToks"
goUnmarked :: Op -> [BS]
goUnmarked y
| unmarked y = []
| otherwise = mergeCStartStr mopts : go MineChanged y
matchDiff :: MergeOpts -> [Tok] -> Diff -> Maybe [Tok]
matchDiff mopt = go Keep
where
withMark :: Op -> Op -> [Tok] -> ([Tok] -> [Tok])
withMark prev op toks = (++) (markToks mopt prev op ++ toks)
go :: Op -> [Tok] -> Diff -> Maybe [Tok]
go prev ts ds
| null ts
, null ds = return $ markToks mopt prev Keep
| ((op, tok):ds') <- ds
, op == Add || op == MineChanged || op == YourChanged =
withMark prev op [tok] <$> go op ts ds'
| (intok:ts') <- ts
, ((op, tok):ds') <- ds
, op == Keep || op == Original
, tokCmp' mopt intok tok =
withMark
prev
op
[ if mergeKeepWhitespace mopt && not (fst intok)
then intok
else tok
] <$>
go op ts' ds'
| (intok:ts') <- ts
, ((op, tok):ds') <- ds
, op == Remove
, tokCmp' mopt intok tok = withMark prev op [] <$> go op ts' ds'
| otherwise = Nothing
tokCmp' :: MergeOpts -> Tok -> Tok -> Bool
tokCmp' MergeOpts {mergeForceWhitespace = x} = tokCmp x
tokCmp :: Bool -> Tok -> Tok -> Bool
tokCmp False (False, _) (False, _) = True -- do not force rejecting on whitespace change
tokCmp _ a b = a == b -- otherwise just compare

View file

@ -1,185 +0,0 @@
{-# LANGUAGE FlexibleContexts #-}
module Redfa where
import Control.Monad
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
import Data.ByteString.UTF8 (fromString, toString)
import Data.List
import Data.Maybe
import Data.Monoid
import qualified Data.Vector as V
import Options.Applicative
import Text.Regex.TDFA
import Types
import Substr
data RedfaOption
= RedfaOptionRules [BS]
| RedfaOptionFile String
deriving (Show)
data RedfaSpec =
RedfaSpec
{ redfaStart :: Int
, redfaNames :: V.Vector BS
, redfaRules :: V.Vector [RedfaRule] --start, mid, jump, token
}
data RedfaRule =
RedfaRule
{ rrRegexStart :: Regex
, rrRegexMid :: Regex
, rrJump :: Int
, rrIsToken :: Bool
}
redfaOption :: Parser RedfaOption
redfaOption =
RedfaOptionRules <$>
some
(strOption $
metavar "RULE" <>
short 'L' <>
long "lex" <> help "Lexing rule (specify repeatedly for more rules)") <|>
RedfaOptionFile <$>
strOption
(metavar "FILE" <>
short 'l' <>
long "lexer" <> help "File from where to load the lexing rules")
redfaOptionToRuleStrings :: RedfaOption -> IO [BS]
redfaOptionToRuleStrings (RedfaOptionRules x) = return x
redfaOptionToRuleStrings (RedfaOptionFile fn) =
B8.lines <$> B.readFile fn -- TODO improve
splitFirst :: Char -> BS -> (BS, BS)
splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
redfaRuleStringToRuleStr s =
let (spec, regex) = splitFirst ':' s
(from, to) = splitFirst '>' spec
sf = B8.strip from
(cleanFrom, isToken)
| B.null sf = (sf, True)
| B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False)
| otherwise = (sf, True)
go
| B.null s = Nothing
| B.head s == 35 = Nothing -- # comment
| B.null regex = Just (B.empty, B.empty, spec, isToken)
| B.null to = Just (cleanFrom, cleanFrom, B.tail regex, isToken)
| otherwise =
Just (cleanFrom, B8.strip $ B.tail to, B.tail regex, isToken)
in go
unescapeRegex :: MonadFail m => BS -> m BS
unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s
where
unescape' :: MonadFail m => BS -> m BB.Builder
unescape' s
| B.null s = return mempty
| B.head s == BI.c2w '\\' && B.null (B.tail s) =
fail "incomplete escape sequence"
| B.head s == BI.c2w '\\' =
let rest = B.tail s
cc = B.head rest
thechar =
BB.stringUtf8 $
case BI.w2c cc of
'`' -> "\\`"
'\'' -> "\\'"
'b' -> "\\b"
'B' -> "\\B"
'<' -> "\\<"
'>' -> "\\>"
'a' -> "\a"
'e' -> "\x1b"
'f' -> "\f"
'n' -> "\n"
'r' -> "\r"
't' -> "\t"
'\\' -> "\\\\"
a -> [a] --TODO add support for \x and \u
in (thechar <>) <$> unescape' (B.tail rest)
| otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s)
redfaPrepareRules :: RedfaOption -> IO RedfaSpec
redfaPrepareRules opt = do
(states, jumps, regexes, isToken) <-
unzip4 . mapMaybe redfaRuleStringToRuleStr <$>
redfaOptionToRuleStrings opt
uRegexes <- traverse unescapeRegex regexes
startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
midREs <- traverse (makeRegexM . (fromString "\\`(.|\n)" <>)) uRegexes
let ids = nub . sort $ states ++ jumps
index' = fromJust . flip elemIndex ids
statesIds = map index' states
jumpsIds = map index' jumps
start = index' $ head states
rules :: [[RedfaRule]]
rules = do
stateId <- [0 .. length ids - 1]
return $ do
(a, b, (rs, rm), t) <-
zip4 statesIds jumpsIds (zip startREs midREs) isToken
guard $ a == stateId
return $ RedfaRule rs rm b t
return $ RedfaSpec start (V.fromList ids) (V.fromList rules)
redfaTokenize :: MonadFail m => RedfaSpec -> BS -> m [Tok]
redfaTokenize spec s = redfaTokenize' spec s (redfaStart spec) 0 []
redfaTokenize' ::
MonadFail m
=> RedfaSpec
-> BS
-> Int
-> Int
-> [Int]
-> m [Tok]
redfaTokenize' spec s state off visited
| off >= B.length s = pure []
| otherwise =
let (ooff, reg) =
if off == 0
then (0, rrRegexStart)
else (1, rrRegexMid)
matchString = B.drop (off - ooff) s
matches :: [(RedfaRule, (MatchOffset, MatchLength))]
matches =
filter contOK $
zip rules $ map (\x -> match (reg x) matchString) rules
contOK (RedfaRule {rrJump = j}, (off', len))
| off' /= 0 = False
| len > ooff = True
| otherwise = j `notElem` visited
in case matches of
[] ->
fail $
"Tokenization could not continue from " <>
stateStrPretty <> " at offset " <> show (off + ooff)
((rule, (_, len)):_) ->
let matchLen = len - ooff
in (if matchLen > 0
then (:) (rrIsToken rule, substr off matchLen s)
else id) <$>
redfaTokenize'
spec
s
(rrJump rule)
(off + matchLen)
(if matchLen > 0
then []
else state : visited)
where
rules = redfaRules spec V.! state
stateStr = redfaNames spec V.! state
stateStrPretty
| B.null stateStr = "anonymous state"
| otherwise = "state `" <> toString stateStr <> "'"

View file

@ -1,7 +0,0 @@
module Substr where
import Types
import qualified Data.ByteString as B
substr :: Int -> Int -> BS -> BS
substr b l = B.take l . B.drop b

114
src/Tokenizers.hs Normal file
View file

@ -0,0 +1,114 @@
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE TupleSections #-}
module Tokenizers (TokOpts, tokOptions, tokenize) where
import qualified Data.ByteString as B
import qualified Data.ByteString.UTF8 as BU8
import qualified Data.Word8 as W8
import Options.Applicative
import Types
import qualified Unicode.Char.General as UC
data TokOpts =
TokOpts
{ optTokenizerName :: String
}
deriving (Show)
tokOptions :: Parser TokOpts
tokOptions =
TokOpts <$>
strOption
(metavar "LEXER" <>
short 'l' <>
long "lexer" <>
value "text" <>
help "Lexer name (use --help-lexers to list available ones)")
--TODO this should later allow choosing the lexer by suffix
tokenize :: MonadFail m => TokOpts -> FilePath -> BS -> m [Tok]
tokenize topt _ =
case filter ((== optTokenizerName topt) . tkName) tokenizers of
[t] -> runTokenizer t
_ -> const $ fail "Lexer not found"
data Tokenizer =
Tokenizer
{ tkName :: String
, tkDescription :: String
, runTokenizer :: forall m. MonadFail m =>
BS -> m [Tok]
}
tokenizers :: [Tokenizer]
tokenizers =
[ Tokenizer "lines" "works like the traditional diff" tokenizeLines
, Tokenizer "asciiwords" "separate by ASCII whitespace" tokenizeWords8
, Tokenizer "words" "separate on any UTF8 spacing" tokenizeWords
, Tokenizer
"text"
"separate groups of similar-class UTF8 characters"
tokenizeUnicode
]
tokenizeLines :: MonadFail m => BS -> m [Tok]
tokenizeLines = pure . map (True, ) . BU8.lines'
tokenizeWords8 :: MonadFail m => BS -> m [Tok]
tokenizeWords8 =
pure . map (\x -> (not $ W8.isSpace $ B.head x, x)) . bsGroupOn W8.isSpace
where
bsGroupOn f = B.groupBy (\l r -> f l == f r)
tokenizeWords :: MonadFail m => BS -> m [Tok]
tokenizeWords = pure . map makeTokenBU8 . groupOnBU8 UC.isWhiteSpace
makeTokenBU8 :: BS -> Tok
makeTokenBU8 s = if
maybe False (not . UC.isWhiteSpace . fst) (BU8.decode s)
then (True, s)
else (False, s)
{- needed?
foldri :: (Char -> Int -> a -> a) -> a -> BS -> a
foldri cons nil cs =
case decode cs of
Just (a, l) -> cons a l (foldrS cons nil $ drop l cs)
Nothing -> nil
-}
spanBU8 :: (Char -> Bool) -> BS -> (BS, BS)
spanBU8 f orig = B.splitAt (go 0 orig) orig
where
go :: Int -> BS -> Int
go n bs =
case BU8.decode bs of
Just (a, l) ->
if f a
then go (n + l) (B.drop l bs)
else n
Nothing -> n
groupByBU8 :: (Char -> Char -> Bool) -> BS -> [BS]
groupByBU8 f s =
case BU8.decode s of
Just (a, _) ->
let (g, s') = spanBU8 (f a) s
in g : groupByBU8 f s'
Nothing -> []
groupOnBU8 :: Eq a => (Char -> a) -> BS -> [BS]
groupOnBU8 f = groupByBU8 (\l r -> f l == f r)
tokenizeUnicode :: MonadFail m => BS -> m [Tok]
tokenizeUnicode = pure . map makeTokenBU8 . groupOnBU8 simpleUnicodeCategory
simpleUnicodeCategory :: Char -> Int
simpleUnicodeCategory c
| UC.isWhiteSpace c = 1
| UC.isAlphaNum c = 2
| UC.isSymbol c = 3
| UC.isPunctuation c = 4
| UC.isSeparator c = 5
| otherwise = 0

View file

@ -1,3 +1,5 @@
--{-# LANGUAGE BangPatterns #-}
module Types where
import Data.ByteString
@ -5,6 +7,7 @@ import Data.Vector
type BS = ByteString
{- TODO: all this needs to get unboxed -}
type Tok = (Bool, BS)
type TV = Vector Tok
@ -21,3 +24,9 @@ data Op
| Original
| YourChanged
deriving (Show, Eq)
data Origin
= Stable
| Mine
| Your
deriving (Show, Eq)

View file

@ -7,6 +7,7 @@ import Options.Applicative
adiffVersion :: String
adiffVersion = VERSION_adiff
versionOption :: String -> Parser (a -> a)
versionOption prog =
infoOption
(prog <>