some working version
This commit is contained in:
parent
bd4e79e064
commit
64b5ca7ea7
231
src/Diff.hs
231
src/Diff.hs
|
@ -32,31 +32,240 @@ data Op
|
||||||
| Add
|
| Add
|
||||||
deriving (Show, Eq)
|
deriving (Show, Eq)
|
||||||
|
|
||||||
|
data DiffEnv =
|
||||||
|
DiffEnv
|
||||||
|
{ deD1 :: BS
|
||||||
|
, deD2 :: BS
|
||||||
|
, deT1 :: V.Vector Tok
|
||||||
|
, deT2 :: V.Vector Tok
|
||||||
|
, deS :: Int
|
||||||
|
, deE :: Int
|
||||||
|
, deL :: Int
|
||||||
|
, deW :: Int
|
||||||
|
, deA :: Int
|
||||||
|
, deB :: Int
|
||||||
|
, deVS :: V.Vector Int
|
||||||
|
, deVE :: V.Vector Int
|
||||||
|
, deTrans :: Bool
|
||||||
|
}
|
||||||
|
deriving (Show)
|
||||||
|
|
||||||
substr b e = B.take (e - b) . B.drop b
|
substr b e = B.take (e - b) . B.drop b
|
||||||
|
|
||||||
toksMatch d1 d2 t1 t2 x y =
|
toksMatch x y DiffEnv {deT1 = t1, deT2 = t2, deD1 = d1, deD2 = d2} =
|
||||||
let (tok1, (b1, e1)) = t1 V.! x
|
let (isTok1, (b1, e1)) = t1 V.! x
|
||||||
(tok2, (b2, e2)) = t2 V.! y
|
(isTok2, (b2, e2)) = t2 V.! y
|
||||||
in tok1 == tok2 && substr b1 e1 d1 == substr b2 e2 d2
|
in isTok1 == isTok2 && substr b1 e1 d1 == substr b2 e2 d2
|
||||||
|
|
||||||
stripEqToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> (Int, Int, Int)
|
stripEqToks :: DiffEnv -> (Int, Int, Int)
|
||||||
stripEqToks d1 d2 t1 t2 = (b, l1 - e, l2 - e)
|
stripEqToks de@DiffEnv {deT1 = t1, deT2 = t2} = (b, l1 - e, l2 - e)
|
||||||
where
|
where
|
||||||
l1 = V.length t1
|
l1 = V.length t1
|
||||||
l2 = V.length t2
|
l2 = V.length t2
|
||||||
firstDiff i
|
firstDiff i
|
||||||
| i < l1 && i < l2 && toksMatch d1 d2 t1 t2 i i = firstDiff (i + 1)
|
| i < l1 && i < l2 && toksMatch i i de = firstDiff (i + 1)
|
||||||
| otherwise = i
|
| otherwise = i
|
||||||
b = firstDiff 0
|
b = firstDiff 0
|
||||||
lastDiff i
|
lastDiff i
|
||||||
| l1 - i - 1 >= b &&
|
| l1 - i - 1 >= b &&
|
||||||
l2 - i - 1 >= b && toksMatch d1 d2 t1 t2 (l1 - i - 1) (l2 - i - 1) =
|
l2 - i - 1 >= b && toksMatch (l1 - i - 1) (l2 - i - 1) de =
|
||||||
lastDiff (i + 1)
|
lastDiff (i + 1)
|
||||||
| otherwise = i
|
| otherwise = i
|
||||||
e = lastDiff 0
|
e = lastDiff 0
|
||||||
|
|
||||||
diffToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> IO [(Op, Tok)]
|
diffToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> [(Op, Tok)]
|
||||||
diffToks d1 d2 t1 t2 = do
|
diffToks d1 d2 t1 t2 =
|
||||||
|
diffToks' $
|
||||||
|
if V.length t1 >= V.length t2
|
||||||
|
then DiffEnv
|
||||||
|
{ deD1 = d1
|
||||||
|
, deD2 = d2
|
||||||
|
, deT1 = t1
|
||||||
|
, deT2 = t2
|
||||||
|
, deS = 0
|
||||||
|
, deE = V.length t1
|
||||||
|
, deL = V.length t1
|
||||||
|
, deW = V.length t2
|
||||||
|
, deA = 0
|
||||||
|
, deB = V.length t2
|
||||||
|
, deVS = V.fromList [0 .. V.length t2]
|
||||||
|
, deVE = V.fromList $ reverse [0 .. V.length t2]
|
||||||
|
, deTrans = False
|
||||||
|
}
|
||||||
|
else DiffEnv
|
||||||
|
{ deD1 = d2
|
||||||
|
, deD2 = d1
|
||||||
|
, deT1 = t2
|
||||||
|
, deT2 = t1
|
||||||
|
, deS = 0
|
||||||
|
, deE = V.length t2
|
||||||
|
, deL = V.length t2
|
||||||
|
, deW = V.length t1
|
||||||
|
, deA = 0
|
||||||
|
, deB = V.length t1
|
||||||
|
, deVS = V.fromList [0 .. V.length t1]
|
||||||
|
, deVE = V.fromList $ reverse [0 .. V.length t1]
|
||||||
|
, deTrans = True
|
||||||
|
}
|
||||||
|
|
||||||
|
minIndexFwd =
|
||||||
|
V.minIndexBy
|
||||||
|
(\x y ->
|
||||||
|
if x <= y
|
||||||
|
then LT
|
||||||
|
else GT --basically normal V.minIndex
|
||||||
|
)
|
||||||
|
|
||||||
|
minIndexRev =
|
||||||
|
V.minIndexBy
|
||||||
|
(\x y ->
|
||||||
|
if x < y
|
||||||
|
then LT
|
||||||
|
else GT --picks the last minimum
|
||||||
|
)
|
||||||
|
|
||||||
|
diffToks' :: DiffEnv -> [(Op, Tok)]
|
||||||
|
diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
||||||
|
diff
|
||||||
|
where
|
||||||
|
mid = (s + e) `div` 2
|
||||||
|
vecSmid = vecS mid
|
||||||
|
vecEmid = vecE mid
|
||||||
|
vecS = vec -- "forward" operation
|
||||||
|
where
|
||||||
|
vec i
|
||||||
|
| i == s = deVS de
|
||||||
|
| i > s = V.fromList . upd i . vec $ pred i
|
||||||
|
| otherwise = error "Internal bounds check failure"
|
||||||
|
upd i v = i : go 1 i
|
||||||
|
where
|
||||||
|
go j up
|
||||||
|
| j > deW de = []
|
||||||
|
| otherwise =
|
||||||
|
let left = v V.! j
|
||||||
|
upleft = v V.! pred j
|
||||||
|
keep
|
||||||
|
| toksMatch (pred i) (pred j) de = min upleft
|
||||||
|
| otherwise = id
|
||||||
|
res = keep $ min (succ up) (succ left)
|
||||||
|
in res : go (succ j) res
|
||||||
|
vecE = vec -- "backward" operation
|
||||||
|
where
|
||||||
|
vec i
|
||||||
|
| i == e = deVE de
|
||||||
|
| i < e = V.fromList . reverse . upd i . vec $ succ i
|
||||||
|
| otherwise = error "Internal bounds check failure"
|
||||||
|
upd i v = (deL de - i) : go (pred $ deW de) (deL de - i)
|
||||||
|
where
|
||||||
|
go j down
|
||||||
|
| j < 0 = []
|
||||||
|
| otherwise =
|
||||||
|
let right = v V.! j
|
||||||
|
downright = v V.! succ j
|
||||||
|
keep
|
||||||
|
| toksMatch i j de = min downright
|
||||||
|
| otherwise = id
|
||||||
|
res = keep $ min (succ down) (succ right)
|
||||||
|
in res : go (pred j) res
|
||||||
|
{- Now, find the optimal point for splitting.
|
||||||
|
-
|
||||||
|
- Heuristics A: if 2 paths are completely same, prefer deletion first;
|
||||||
|
- which is done by choosing the 'upper' of two possibilities
|
||||||
|
- preferentially (or 'lower' ie 'more to the right' in case of transposed
|
||||||
|
- matrix) -}
|
||||||
|
opt =
|
||||||
|
(a +) .
|
||||||
|
(if trans
|
||||||
|
then minIndexRev
|
||||||
|
else minIndexFwd) $
|
||||||
|
V.zipWith (+) (slice vecSmid) (slice vecEmid)
|
||||||
|
where
|
||||||
|
slice = V.slice a (succ $ b - a)
|
||||||
|
diff
|
||||||
|
| s > e =
|
||||||
|
error $
|
||||||
|
"Internal failure -- recursion off limits: " <>
|
||||||
|
show s <> " vs " <> show e
|
||||||
|
| s == e = map (\i -> (Add, deT2 de V.! i)) [a .. pred b]
|
||||||
|
| succ s == e =
|
||||||
|
let vecLS = deVS de
|
||||||
|
vecRE = deVE de
|
||||||
|
vecLE = vecE s
|
||||||
|
vecRS = vecS e
|
||||||
|
sumL = V.zipWith (+) vecLS vecLE
|
||||||
|
sumR = V.zipWith (+) vecRS vecRE
|
||||||
|
{- This is getting a bit complicted. In the non-transposed case, we
|
||||||
|
- want to select one Remove/Keep surrounded by 0-n Add ops, possibly
|
||||||
|
- from both sides. The chosen path must belong to the best paths
|
||||||
|
- (bidirectional matrix sums must match the minimum at (s,a) and
|
||||||
|
- (b,e) on all steps), AND at the same time the path must be
|
||||||
|
- admissible for the edit operations (ie, it has to `backtrack
|
||||||
|
- well`). Also, it should follow Heuristic A that says that Remove
|
||||||
|
- and Keep ops should go earlier than Add ops (or vice versa if
|
||||||
|
- transposed). -}
|
||||||
|
totalCost = sumL V.! a
|
||||||
|
sCost = vecLS V.! a
|
||||||
|
eCost = vecRS V.! b
|
||||||
|
doKeep
|
||||||
|
| eCost - sCost == succ (b - a) = False
|
||||||
|
| eCost - sCost == pred (b - a) = True
|
||||||
|
| otherwise =
|
||||||
|
error $
|
||||||
|
"Internal check failure -- costs seem broken: " <>
|
||||||
|
show [sCost, eCost, a, b]
|
||||||
|
jumpPos =
|
||||||
|
(if trans {- Heuristic A applies here -}
|
||||||
|
then last
|
||||||
|
else head)
|
||||||
|
[ i
|
||||||
|
| i <-
|
||||||
|
[a .. if doKeep
|
||||||
|
then pred b
|
||||||
|
else b]
|
||||||
|
, vecLS V.! i == sCost - a + i
|
||||||
|
, sumL V.! i == totalCost
|
||||||
|
, if doKeep
|
||||||
|
then vecLS V.! i == vecRS V.! succ i
|
||||||
|
else succ (vecLS V.! i) == vecRS V.! i
|
||||||
|
, if doKeep
|
||||||
|
then sumR V.! succ i == totalCost
|
||||||
|
else sumR V.! i == totalCost
|
||||||
|
, not doKeep || toksMatch s i de
|
||||||
|
]
|
||||||
|
jumpEnd =
|
||||||
|
if doKeep
|
||||||
|
then jumpPos + 1
|
||||||
|
else jumpPos
|
||||||
|
in map
|
||||||
|
(\i ->
|
||||||
|
( if trans
|
||||||
|
then Remove
|
||||||
|
else Add
|
||||||
|
, deT2 de V.! i))
|
||||||
|
[a .. pred jumpPos] ++
|
||||||
|
[ if trans
|
||||||
|
then if doKeep
|
||||||
|
then (Keep, deT2 de V.! jumpPos)
|
||||||
|
else (Add, deT1 de V.! s)
|
||||||
|
else ( if doKeep
|
||||||
|
then Keep
|
||||||
|
else Remove
|
||||||
|
, deT1 de V.! s)
|
||||||
|
] ++
|
||||||
|
map
|
||||||
|
(\i ->
|
||||||
|
( if trans
|
||||||
|
then Remove
|
||||||
|
else Add
|
||||||
|
, deT2 de V.! i))
|
||||||
|
[jumpEnd .. pred b]
|
||||||
|
| otherwise =
|
||||||
|
diffToks' de {deE = mid, deVE = vecEmid, deB = opt} ++
|
||||||
|
diffToks' de {deS = mid, deVS = vecSmid, deA = opt}
|
||||||
|
|
||||||
|
{-
|
||||||
|
diffToksXX :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> IO [(Op, Tok)]
|
||||||
|
diffToksXX d1 d2 t1 t2 = do
|
||||||
let (b, e1, e2) = stripEqToks d1 d2 t1 t2
|
let (b, e1, e2) = stripEqToks d1 d2 t1 t2
|
||||||
ms1 = e1 - b
|
ms1 = e1 - b
|
||||||
ms2 = e2 - b
|
ms2 = e2 - b
|
||||||
|
@ -96,7 +305,7 @@ diffToks d1 d2 t1 t2 = do
|
||||||
return $
|
return $
|
||||||
map ((,) Keep) (take b $ V.toList t1) ++
|
map ((,) Keep) (take b $ V.toList t1) ++
|
||||||
diff ++ map ((,) Keep) (drop e1 $ V.toList t1)
|
diff ++ map ((,) Keep) (drop e1 $ V.toList t1)
|
||||||
|
-}
|
||||||
pprDiff :: BS -> BS -> [(Op, Tok)] -> [BS]
|
pprDiff :: BS -> BS -> [(Op, Tok)] -> [BS]
|
||||||
pprDiff d1 d2 = map (pprDiff1 d1 d2)
|
pprDiff d1 d2 = map (pprDiff1 d1 d2)
|
||||||
|
|
||||||
|
|
|
@ -71,8 +71,8 @@ patchCmdOptions =
|
||||||
auto
|
auto
|
||||||
(short 'p' <>
|
(short 'p' <>
|
||||||
long "strip" <>
|
long "strip" <>
|
||||||
metavar "NUM" <> help "Strip NUM leading components from the paths" <>
|
metavar "NUM" <>
|
||||||
value 0) <*>
|
help "Strip NUM leading components from the paths" <> value 0) <*>
|
||||||
mergeOption True
|
mergeOption True
|
||||||
|
|
||||||
diff3CmdOptions =
|
diff3CmdOptions =
|
||||||
|
@ -103,11 +103,15 @@ main =
|
||||||
progDesc
|
progDesc
|
||||||
"Compare, patch and merge files on arbitrarily-tokenized sequences." <>
|
"Compare, patch and merge files on arbitrarily-tokenized sequences." <>
|
||||||
header "adiff: arbitrary-token diff utilities")
|
header "adiff: arbitrary-token diff utilities")
|
||||||
in do o <- execParser opts
|
in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <-
|
||||||
redfa <- redfaPrepareRules (diffRedfaOpt o)
|
execParser opts
|
||||||
data1 <- mmapFileByteString (diffFile1 o) Nothing
|
redfa <- redfaPrepareRules ropt
|
||||||
data2 <- mmapFileByteString (diffFile2 o) Nothing
|
case copt of
|
||||||
toks1 <- V.fromList <$> redfaTokenize redfa data1
|
CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
|
||||||
toks2 <- V.fromList <$> redfaTokenize redfa data2
|
data1 <- mmapFileByteString f1 Nothing
|
||||||
hs <- hunks (max 0 $ context o) <$> diffToks data1 data2 toks1 toks2
|
data2 <- mmapFileByteString f2 Nothing
|
||||||
B8.putStrLn $ pprHunks data1 data2 hs
|
toks1 <- V.fromList <$> redfaTokenize redfa data1
|
||||||
|
toks2 <- V.fromList <$> redfaTokenize redfa data2
|
||||||
|
let hs = hunks (max 0 ctxt) $ diffToks data1 data2 toks1 toks2
|
||||||
|
B8.putStrLn $ pprHunks data1 data2 hs
|
||||||
|
CmdPatch {} -> putStrLn "not supported yet"
|
||||||
|
|
14
src/Redfa.hs
14
src/Redfa.hs
|
@ -59,7 +59,7 @@ redfaOptionToRuleStrings (RedfaOptionFile fn) =
|
||||||
B8.lines <$> B.readFile fn -- TODO improve
|
B8.lines <$> B.readFile fn -- TODO improve
|
||||||
|
|
||||||
splitFirst :: Char -> BS -> (BS, BS)
|
splitFirst :: Char -> BS -> (BS, BS)
|
||||||
splitFirst c s = B.splitAt (maybe (B.length s) id $ B8.elemIndex c s) s
|
splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
|
||||||
|
|
||||||
redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
|
redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
|
||||||
redfaRuleStringToRuleStr s =
|
redfaRuleStringToRuleStr s =
|
||||||
|
@ -68,7 +68,7 @@ redfaRuleStringToRuleStr s =
|
||||||
sf = B8.strip from
|
sf = B8.strip from
|
||||||
(cleanFrom, isToken)
|
(cleanFrom, isToken)
|
||||||
| B.null sf = (sf, True)
|
| B.null sf = (sf, True)
|
||||||
| B.head sf == (fromIntegral $ fromEnum '_') = (B.tail sf, False)
|
| B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False)
|
||||||
| otherwise = (sf, True)
|
| otherwise = (sf, True)
|
||||||
go
|
go
|
||||||
| B.null s = Nothing
|
| B.null s = Nothing
|
||||||
|
@ -89,7 +89,7 @@ unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s
|
||||||
fail "incomplete escape sequence"
|
fail "incomplete escape sequence"
|
||||||
| B.head s == BI.c2w '\\' =
|
| B.head s == BI.c2w '\\' =
|
||||||
let rest = B.tail s
|
let rest = B.tail s
|
||||||
cc = B.head (rest)
|
cc = B.head rest
|
||||||
thechar =
|
thechar =
|
||||||
BB.stringUtf8 $
|
BB.stringUtf8 $
|
||||||
case BI.w2c cc of
|
case BI.w2c cc of
|
||||||
|
@ -108,12 +108,12 @@ unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s
|
||||||
'\\' -> "\\\\"
|
'\\' -> "\\\\"
|
||||||
a -> [a] --TODO add support for \x and \u
|
a -> [a] --TODO add support for \x and \u
|
||||||
in (thechar <>) <$> unescape' (B.tail rest)
|
in (thechar <>) <$> unescape' (B.tail rest)
|
||||||
| otherwise = (mappend $ BB.word8 $ B.head s) <$> unescape' (B.tail s)
|
| otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s)
|
||||||
|
|
||||||
redfaPrepareRules :: RedfaOption -> IO RedfaSpec
|
redfaPrepareRules :: RedfaOption -> IO RedfaSpec
|
||||||
redfaPrepareRules opt = do
|
redfaPrepareRules opt = do
|
||||||
(states, jumps, regexes, isToken) <-
|
(states, jumps, regexes, isToken) <-
|
||||||
unzip4 . catMaybes . fmap redfaRuleStringToRuleStr <$>
|
unzip4 . mapMaybe redfaRuleStringToRuleStr <$>
|
||||||
redfaOptionToRuleStrings opt
|
redfaOptionToRuleStrings opt
|
||||||
uRegexes <- traverse unescapeRegex regexes
|
uRegexes <- traverse unescapeRegex regexes
|
||||||
startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
|
startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
|
||||||
|
@ -148,7 +148,7 @@ redfaTokenize' spec s state off visited
|
||||||
| off >= B.length s = pure []
|
| off >= B.length s = pure []
|
||||||
| otherwise =
|
| otherwise =
|
||||||
let (ooff, reg) =
|
let (ooff, reg) =
|
||||||
if (off == 0)
|
if off == 0
|
||||||
then (0, rrRegexStart)
|
then (0, rrRegexStart)
|
||||||
else (1, rrRegexMid)
|
else (1, rrRegexMid)
|
||||||
matchString = B.drop (off - ooff) s
|
matchString = B.drop (off - ooff) s
|
||||||
|
@ -159,7 +159,7 @@ redfaTokenize' spec s state off visited
|
||||||
contOK (RedfaRule {rrJump = j}, (off', len))
|
contOK (RedfaRule {rrJump = j}, (off', len))
|
||||||
| off' /= 0 = False
|
| off' /= 0 = False
|
||||||
| len > ooff = True
|
| len > ooff = True
|
||||||
| otherwise = not $ j `elem` visited
|
| otherwise = j `notElem` visited
|
||||||
in case matches of
|
in case matches of
|
||||||
[] ->
|
[] ->
|
||||||
fail $
|
fail $
|
||||||
|
|
Loading…
Reference in a new issue