some working version

This commit is contained in:
Mirek Kratochvil 2020-09-19 17:56:28 +02:00
parent bd4e79e064
commit 64b5ca7ea7
3 changed files with 241 additions and 28 deletions

View file

@ -32,31 +32,240 @@ data Op
| Add | Add
deriving (Show, Eq) deriving (Show, Eq)
data DiffEnv =
DiffEnv
{ deD1 :: BS
, deD2 :: BS
, deT1 :: V.Vector Tok
, deT2 :: V.Vector Tok
, deS :: Int
, deE :: Int
, deL :: Int
, deW :: Int
, deA :: Int
, deB :: Int
, deVS :: V.Vector Int
, deVE :: V.Vector Int
, deTrans :: Bool
}
deriving (Show)
substr b e = B.take (e - b) . B.drop b substr b e = B.take (e - b) . B.drop b
toksMatch d1 d2 t1 t2 x y = toksMatch x y DiffEnv {deT1 = t1, deT2 = t2, deD1 = d1, deD2 = d2} =
let (tok1, (b1, e1)) = t1 V.! x let (isTok1, (b1, e1)) = t1 V.! x
(tok2, (b2, e2)) = t2 V.! y (isTok2, (b2, e2)) = t2 V.! y
in tok1 == tok2 && substr b1 e1 d1 == substr b2 e2 d2 in isTok1 == isTok2 && substr b1 e1 d1 == substr b2 e2 d2
stripEqToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> (Int, Int, Int) stripEqToks :: DiffEnv -> (Int, Int, Int)
stripEqToks d1 d2 t1 t2 = (b, l1 - e, l2 - e) stripEqToks de@DiffEnv {deT1 = t1, deT2 = t2} = (b, l1 - e, l2 - e)
where where
l1 = V.length t1 l1 = V.length t1
l2 = V.length t2 l2 = V.length t2
firstDiff i firstDiff i
| i < l1 && i < l2 && toksMatch d1 d2 t1 t2 i i = firstDiff (i + 1) | i < l1 && i < l2 && toksMatch i i de = firstDiff (i + 1)
| otherwise = i | otherwise = i
b = firstDiff 0 b = firstDiff 0
lastDiff i lastDiff i
| l1 - i - 1 >= b && | l1 - i - 1 >= b &&
l2 - i - 1 >= b && toksMatch d1 d2 t1 t2 (l1 - i - 1) (l2 - i - 1) = l2 - i - 1 >= b && toksMatch (l1 - i - 1) (l2 - i - 1) de =
lastDiff (i + 1) lastDiff (i + 1)
| otherwise = i | otherwise = i
e = lastDiff 0 e = lastDiff 0
diffToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> IO [(Op, Tok)] diffToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> [(Op, Tok)]
diffToks d1 d2 t1 t2 = do diffToks d1 d2 t1 t2 =
diffToks' $
if V.length t1 >= V.length t2
then DiffEnv
{ deD1 = d1
, deD2 = d2
, deT1 = t1
, deT2 = t2
, deS = 0
, deE = V.length t1
, deL = V.length t1
, deW = V.length t2
, deA = 0
, deB = V.length t2
, deVS = V.fromList [0 .. V.length t2]
, deVE = V.fromList $ reverse [0 .. V.length t2]
, deTrans = False
}
else DiffEnv
{ deD1 = d2
, deD2 = d1
, deT1 = t2
, deT2 = t1
, deS = 0
, deE = V.length t2
, deL = V.length t2
, deW = V.length t1
, deA = 0
, deB = V.length t1
, deVS = V.fromList [0 .. V.length t1]
, deVE = V.fromList $ reverse [0 .. V.length t1]
, deTrans = True
}
minIndexFwd =
V.minIndexBy
(\x y ->
if x <= y
then LT
else GT --basically normal V.minIndex
)
minIndexRev =
V.minIndexBy
(\x y ->
if x < y
then LT
else GT --picks the last minimum
)
diffToks' :: DiffEnv -> [(Op, Tok)]
diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
diff
where
mid = (s + e) `div` 2
vecSmid = vecS mid
vecEmid = vecE mid
vecS = vec -- "forward" operation
where
vec i
| i == s = deVS de
| i > s = V.fromList . upd i . vec $ pred i
| otherwise = error "Internal bounds check failure"
upd i v = i : go 1 i
where
go j up
| j > deW de = []
| otherwise =
let left = v V.! j
upleft = v V.! pred j
keep
| toksMatch (pred i) (pred j) de = min upleft
| otherwise = id
res = keep $ min (succ up) (succ left)
in res : go (succ j) res
vecE = vec -- "backward" operation
where
vec i
| i == e = deVE de
| i < e = V.fromList . reverse . upd i . vec $ succ i
| otherwise = error "Internal bounds check failure"
upd i v = (deL de - i) : go (pred $ deW de) (deL de - i)
where
go j down
| j < 0 = []
| otherwise =
let right = v V.! j
downright = v V.! succ j
keep
| toksMatch i j de = min downright
| otherwise = id
res = keep $ min (succ down) (succ right)
in res : go (pred j) res
{- Now, find the optimal point for splitting.
-
- Heuristics A: if 2 paths are completely same, prefer deletion first;
- which is done by choosing the 'upper' of two possibilities
- preferentially (or 'lower' ie 'more to the right' in case of transposed
- matrix) -}
opt =
(a +) .
(if trans
then minIndexRev
else minIndexFwd) $
V.zipWith (+) (slice vecSmid) (slice vecEmid)
where
slice = V.slice a (succ $ b - a)
diff
| s > e =
error $
"Internal failure -- recursion off limits: " <>
show s <> " vs " <> show e
| s == e = map (\i -> (Add, deT2 de V.! i)) [a .. pred b]
| succ s == e =
let vecLS = deVS de
vecRE = deVE de
vecLE = vecE s
vecRS = vecS e
sumL = V.zipWith (+) vecLS vecLE
sumR = V.zipWith (+) vecRS vecRE
{- This is getting a bit complicted. In the non-transposed case, we
- want to select one Remove/Keep surrounded by 0-n Add ops, possibly
- from both sides. The chosen path must belong to the best paths
- (bidirectional matrix sums must match the minimum at (s,a) and
- (b,e) on all steps), AND at the same time the path must be
- admissible for the edit operations (ie, it has to `backtrack
- well`). Also, it should follow Heuristic A that says that Remove
- and Keep ops should go earlier than Add ops (or vice versa if
- transposed). -}
totalCost = sumL V.! a
sCost = vecLS V.! a
eCost = vecRS V.! b
doKeep
| eCost - sCost == succ (b - a) = False
| eCost - sCost == pred (b - a) = True
| otherwise =
error $
"Internal check failure -- costs seem broken: " <>
show [sCost, eCost, a, b]
jumpPos =
(if trans {- Heuristic A applies here -}
then last
else head)
[ i
| i <-
[a .. if doKeep
then pred b
else b]
, vecLS V.! i == sCost - a + i
, sumL V.! i == totalCost
, if doKeep
then vecLS V.! i == vecRS V.! succ i
else succ (vecLS V.! i) == vecRS V.! i
, if doKeep
then sumR V.! succ i == totalCost
else sumR V.! i == totalCost
, not doKeep || toksMatch s i de
]
jumpEnd =
if doKeep
then jumpPos + 1
else jumpPos
in map
(\i ->
( if trans
then Remove
else Add
, deT2 de V.! i))
[a .. pred jumpPos] ++
[ if trans
then if doKeep
then (Keep, deT2 de V.! jumpPos)
else (Add, deT1 de V.! s)
else ( if doKeep
then Keep
else Remove
, deT1 de V.! s)
] ++
map
(\i ->
( if trans
then Remove
else Add
, deT2 de V.! i))
[jumpEnd .. pred b]
| otherwise =
diffToks' de {deE = mid, deVE = vecEmid, deB = opt} ++
diffToks' de {deS = mid, deVS = vecSmid, deA = opt}
{-
diffToksXX :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> IO [(Op, Tok)]
diffToksXX d1 d2 t1 t2 = do
let (b, e1, e2) = stripEqToks d1 d2 t1 t2 let (b, e1, e2) = stripEqToks d1 d2 t1 t2
ms1 = e1 - b ms1 = e1 - b
ms2 = e2 - b ms2 = e2 - b
@ -96,7 +305,7 @@ diffToks d1 d2 t1 t2 = do
return $ return $
map ((,) Keep) (take b $ V.toList t1) ++ map ((,) Keep) (take b $ V.toList t1) ++
diff ++ map ((,) Keep) (drop e1 $ V.toList t1) diff ++ map ((,) Keep) (drop e1 $ V.toList t1)
-}
pprDiff :: BS -> BS -> [(Op, Tok)] -> [BS] pprDiff :: BS -> BS -> [(Op, Tok)] -> [BS]
pprDiff d1 d2 = map (pprDiff1 d1 d2) pprDiff d1 d2 = map (pprDiff1 d1 d2)

View file

@ -71,8 +71,8 @@ patchCmdOptions =
auto auto
(short 'p' <> (short 'p' <>
long "strip" <> long "strip" <>
metavar "NUM" <> help "Strip NUM leading components from the paths" <> metavar "NUM" <>
value 0) <*> help "Strip NUM leading components from the paths" <> value 0) <*>
mergeOption True mergeOption True
diff3CmdOptions = diff3CmdOptions =
@ -103,11 +103,15 @@ main =
progDesc progDesc
"Compare, patch and merge files on arbitrarily-tokenized sequences." <> "Compare, patch and merge files on arbitrarily-tokenized sequences." <>
header "adiff: arbitrary-token diff utilities") header "adiff: arbitrary-token diff utilities")
in do o <- execParser opts in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <-
redfa <- redfaPrepareRules (diffRedfaOpt o) execParser opts
data1 <- mmapFileByteString (diffFile1 o) Nothing redfa <- redfaPrepareRules ropt
data2 <- mmapFileByteString (diffFile2 o) Nothing case copt of
toks1 <- V.fromList <$> redfaTokenize redfa data1 CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
toks2 <- V.fromList <$> redfaTokenize redfa data2 data1 <- mmapFileByteString f1 Nothing
hs <- hunks (max 0 $ context o) <$> diffToks data1 data2 toks1 toks2 data2 <- mmapFileByteString f2 Nothing
B8.putStrLn $ pprHunks data1 data2 hs toks1 <- V.fromList <$> redfaTokenize redfa data1
toks2 <- V.fromList <$> redfaTokenize redfa data2
let hs = hunks (max 0 ctxt) $ diffToks data1 data2 toks1 toks2
B8.putStrLn $ pprHunks data1 data2 hs
CmdPatch {} -> putStrLn "not supported yet"

View file

@ -59,7 +59,7 @@ redfaOptionToRuleStrings (RedfaOptionFile fn) =
B8.lines <$> B.readFile fn -- TODO improve B8.lines <$> B.readFile fn -- TODO improve
splitFirst :: Char -> BS -> (BS, BS) splitFirst :: Char -> BS -> (BS, BS)
splitFirst c s = B.splitAt (maybe (B.length s) id $ B8.elemIndex c s) s splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool) redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
redfaRuleStringToRuleStr s = redfaRuleStringToRuleStr s =
@ -68,7 +68,7 @@ redfaRuleStringToRuleStr s =
sf = B8.strip from sf = B8.strip from
(cleanFrom, isToken) (cleanFrom, isToken)
| B.null sf = (sf, True) | B.null sf = (sf, True)
| B.head sf == (fromIntegral $ fromEnum '_') = (B.tail sf, False) | B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False)
| otherwise = (sf, True) | otherwise = (sf, True)
go go
| B.null s = Nothing | B.null s = Nothing
@ -89,7 +89,7 @@ unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s
fail "incomplete escape sequence" fail "incomplete escape sequence"
| B.head s == BI.c2w '\\' = | B.head s == BI.c2w '\\' =
let rest = B.tail s let rest = B.tail s
cc = B.head (rest) cc = B.head rest
thechar = thechar =
BB.stringUtf8 $ BB.stringUtf8 $
case BI.w2c cc of case BI.w2c cc of
@ -108,12 +108,12 @@ unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s
'\\' -> "\\\\" '\\' -> "\\\\"
a -> [a] --TODO add support for \x and \u a -> [a] --TODO add support for \x and \u
in (thechar <>) <$> unescape' (B.tail rest) in (thechar <>) <$> unescape' (B.tail rest)
| otherwise = (mappend $ BB.word8 $ B.head s) <$> unescape' (B.tail s) | otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s)
redfaPrepareRules :: RedfaOption -> IO RedfaSpec redfaPrepareRules :: RedfaOption -> IO RedfaSpec
redfaPrepareRules opt = do redfaPrepareRules opt = do
(states, jumps, regexes, isToken) <- (states, jumps, regexes, isToken) <-
unzip4 . catMaybes . fmap redfaRuleStringToRuleStr <$> unzip4 . mapMaybe redfaRuleStringToRuleStr <$>
redfaOptionToRuleStrings opt redfaOptionToRuleStrings opt
uRegexes <- traverse unescapeRegex regexes uRegexes <- traverse unescapeRegex regexes
startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
@ -148,7 +148,7 @@ redfaTokenize' spec s state off visited
| off >= B.length s = pure [] | off >= B.length s = pure []
| otherwise = | otherwise =
let (ooff, reg) = let (ooff, reg) =
if (off == 0) if off == 0
then (0, rrRegexStart) then (0, rrRegexStart)
else (1, rrRegexMid) else (1, rrRegexMid)
matchString = B.drop (off - ooff) s matchString = B.drop (off - ooff) s
@ -159,7 +159,7 @@ redfaTokenize' spec s state off visited
contOK (RedfaRule {rrJump = j}, (off', len)) contOK (RedfaRule {rrJump = j}, (off', len))
| off' /= 0 = False | off' /= 0 = False
| len > ooff = True | len > ooff = True
| otherwise = not $ j `elem` visited | otherwise = j `notElem` visited
in case matches of in case matches of
[] -> [] ->
fail $ fail $