diff --git a/src/Diff.hs b/src/Diff.hs index 01f7df6..78db394 100644 --- a/src/Diff.hs +++ b/src/Diff.hs @@ -32,31 +32,240 @@ data Op | Add deriving (Show, Eq) +data DiffEnv = + DiffEnv + { deD1 :: BS + , deD2 :: BS + , deT1 :: V.Vector Tok + , deT2 :: V.Vector Tok + , deS :: Int + , deE :: Int + , deL :: Int + , deW :: Int + , deA :: Int + , deB :: Int + , deVS :: V.Vector Int + , deVE :: V.Vector Int + , deTrans :: Bool + } + deriving (Show) + substr b e = B.take (e - b) . B.drop b -toksMatch d1 d2 t1 t2 x y = - let (tok1, (b1, e1)) = t1 V.! x - (tok2, (b2, e2)) = t2 V.! y - in tok1 == tok2 && substr b1 e1 d1 == substr b2 e2 d2 +toksMatch x y DiffEnv {deT1 = t1, deT2 = t2, deD1 = d1, deD2 = d2} = + let (isTok1, (b1, e1)) = t1 V.! x + (isTok2, (b2, e2)) = t2 V.! y + in isTok1 == isTok2 && substr b1 e1 d1 == substr b2 e2 d2 -stripEqToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> (Int, Int, Int) -stripEqToks d1 d2 t1 t2 = (b, l1 - e, l2 - e) +stripEqToks :: DiffEnv -> (Int, Int, Int) +stripEqToks de@DiffEnv {deT1 = t1, deT2 = t2} = (b, l1 - e, l2 - e) where l1 = V.length t1 l2 = V.length t2 firstDiff i - | i < l1 && i < l2 && toksMatch d1 d2 t1 t2 i i = firstDiff (i + 1) + | i < l1 && i < l2 && toksMatch i i de = firstDiff (i + 1) | otherwise = i b = firstDiff 0 lastDiff i | l1 - i - 1 >= b && - l2 - i - 1 >= b && toksMatch d1 d2 t1 t2 (l1 - i - 1) (l2 - i - 1) = + l2 - i - 1 >= b && toksMatch (l1 - i - 1) (l2 - i - 1) de = lastDiff (i + 1) | otherwise = i e = lastDiff 0 -diffToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> IO [(Op, Tok)] -diffToks d1 d2 t1 t2 = do +diffToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> [(Op, Tok)] +diffToks d1 d2 t1 t2 = + diffToks' $ + if V.length t1 >= V.length t2 + then DiffEnv + { deD1 = d1 + , deD2 = d2 + , deT1 = t1 + , deT2 = t2 + , deS = 0 + , deE = V.length t1 + , deL = V.length t1 + , deW = V.length t2 + , deA = 0 + , deB = V.length t2 + , deVS = V.fromList [0 .. V.length t2] + , deVE = V.fromList $ reverse [0 .. V.length t2] + , deTrans = False + } + else DiffEnv + { deD1 = d2 + , deD2 = d1 + , deT1 = t2 + , deT2 = t1 + , deS = 0 + , deE = V.length t2 + , deL = V.length t2 + , deW = V.length t1 + , deA = 0 + , deB = V.length t1 + , deVS = V.fromList [0 .. V.length t1] + , deVE = V.fromList $ reverse [0 .. V.length t1] + , deTrans = True + } + +minIndexFwd = + V.minIndexBy + (\x y -> + if x <= y + then LT + else GT --basically normal V.minIndex + ) + +minIndexRev = + V.minIndexBy + (\x y -> + if x < y + then LT + else GT --picks the last minimum + ) + +diffToks' :: DiffEnv -> [(Op, Tok)] +diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} = + diff + where + mid = (s + e) `div` 2 + vecSmid = vecS mid + vecEmid = vecE mid + vecS = vec -- "forward" operation + where + vec i + | i == s = deVS de + | i > s = V.fromList . upd i . vec $ pred i + | otherwise = error "Internal bounds check failure" + upd i v = i : go 1 i + where + go j up + | j > deW de = [] + | otherwise = + let left = v V.! j + upleft = v V.! pred j + keep + | toksMatch (pred i) (pred j) de = min upleft + | otherwise = id + res = keep $ min (succ up) (succ left) + in res : go (succ j) res + vecE = vec -- "backward" operation + where + vec i + | i == e = deVE de + | i < e = V.fromList . reverse . upd i . vec $ succ i + | otherwise = error "Internal bounds check failure" + upd i v = (deL de - i) : go (pred $ deW de) (deL de - i) + where + go j down + | j < 0 = [] + | otherwise = + let right = v V.! j + downright = v V.! succ j + keep + | toksMatch i j de = min downright + | otherwise = id + res = keep $ min (succ down) (succ right) + in res : go (pred j) res + {- Now, find the optimal point for splitting. + - + - Heuristics A: if 2 paths are completely same, prefer deletion first; + - which is done by choosing the 'upper' of two possibilities + - preferentially (or 'lower' ie 'more to the right' in case of transposed + - matrix) -} + opt = + (a +) . + (if trans + then minIndexRev + else minIndexFwd) $ + V.zipWith (+) (slice vecSmid) (slice vecEmid) + where + slice = V.slice a (succ $ b - a) + diff + | s > e = + error $ + "Internal failure -- recursion off limits: " <> + show s <> " vs " <> show e + | s == e = map (\i -> (Add, deT2 de V.! i)) [a .. pred b] + | succ s == e = + let vecLS = deVS de + vecRE = deVE de + vecLE = vecE s + vecRS = vecS e + sumL = V.zipWith (+) vecLS vecLE + sumR = V.zipWith (+) vecRS vecRE + {- This is getting a bit complicted. In the non-transposed case, we + - want to select one Remove/Keep surrounded by 0-n Add ops, possibly + - from both sides. The chosen path must belong to the best paths + - (bidirectional matrix sums must match the minimum at (s,a) and + - (b,e) on all steps), AND at the same time the path must be + - admissible for the edit operations (ie, it has to `backtrack + - well`). Also, it should follow Heuristic A that says that Remove + - and Keep ops should go earlier than Add ops (or vice versa if + - transposed). -} + totalCost = sumL V.! a + sCost = vecLS V.! a + eCost = vecRS V.! b + doKeep + | eCost - sCost == succ (b - a) = False + | eCost - sCost == pred (b - a) = True + | otherwise = + error $ + "Internal check failure -- costs seem broken: " <> + show [sCost, eCost, a, b] + jumpPos = + (if trans {- Heuristic A applies here -} + then last + else head) + [ i + | i <- + [a .. if doKeep + then pred b + else b] + , vecLS V.! i == sCost - a + i + , sumL V.! i == totalCost + , if doKeep + then vecLS V.! i == vecRS V.! succ i + else succ (vecLS V.! i) == vecRS V.! i + , if doKeep + then sumR V.! succ i == totalCost + else sumR V.! i == totalCost + , not doKeep || toksMatch s i de + ] + jumpEnd = + if doKeep + then jumpPos + 1 + else jumpPos + in map + (\i -> + ( if trans + then Remove + else Add + , deT2 de V.! i)) + [a .. pred jumpPos] ++ + [ if trans + then if doKeep + then (Keep, deT2 de V.! jumpPos) + else (Add, deT1 de V.! s) + else ( if doKeep + then Keep + else Remove + , deT1 de V.! s) + ] ++ + map + (\i -> + ( if trans + then Remove + else Add + , deT2 de V.! i)) + [jumpEnd .. pred b] + | otherwise = + diffToks' de {deE = mid, deVE = vecEmid, deB = opt} ++ + diffToks' de {deS = mid, deVS = vecSmid, deA = opt} + +{- +diffToksXX :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> IO [(Op, Tok)] +diffToksXX d1 d2 t1 t2 = do let (b, e1, e2) = stripEqToks d1 d2 t1 t2 ms1 = e1 - b ms2 = e2 - b @@ -96,7 +305,7 @@ diffToks d1 d2 t1 t2 = do return $ map ((,) Keep) (take b $ V.toList t1) ++ diff ++ map ((,) Keep) (drop e1 $ V.toList t1) - +-} pprDiff :: BS -> BS -> [(Op, Tok)] -> [BS] pprDiff d1 d2 = map (pprDiff1 d1 d2) diff --git a/src/MainDiff.hs b/src/MainDiff.hs index 33f9e7a..ea60c54 100644 --- a/src/MainDiff.hs +++ b/src/MainDiff.hs @@ -71,8 +71,8 @@ patchCmdOptions = auto (short 'p' <> long "strip" <> - metavar "NUM" <> help "Strip NUM leading components from the paths" <> - value 0) <*> + metavar "NUM" <> + help "Strip NUM leading components from the paths" <> value 0) <*> mergeOption True diff3CmdOptions = @@ -103,11 +103,15 @@ main = progDesc "Compare, patch and merge files on arbitrarily-tokenized sequences." <> header "adiff: arbitrary-token diff utilities") - in do o <- execParser opts - redfa <- redfaPrepareRules (diffRedfaOpt o) - data1 <- mmapFileByteString (diffFile1 o) Nothing - data2 <- mmapFileByteString (diffFile2 o) Nothing - toks1 <- V.fromList <$> redfaTokenize redfa data1 - toks2 <- V.fromList <$> redfaTokenize redfa data2 - hs <- hunks (max 0 $ context o) <$> diffToks data1 data2 toks1 toks2 - B8.putStrLn $ pprHunks data1 data2 hs + in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <- + execParser opts + redfa <- redfaPrepareRules ropt + case copt of + CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do + data1 <- mmapFileByteString f1 Nothing + data2 <- mmapFileByteString f2 Nothing + toks1 <- V.fromList <$> redfaTokenize redfa data1 + toks2 <- V.fromList <$> redfaTokenize redfa data2 + let hs = hunks (max 0 ctxt) $ diffToks data1 data2 toks1 toks2 + B8.putStrLn $ pprHunks data1 data2 hs + CmdPatch {} -> putStrLn "not supported yet" diff --git a/src/Redfa.hs b/src/Redfa.hs index 4f8d3e5..42b1db2 100644 --- a/src/Redfa.hs +++ b/src/Redfa.hs @@ -59,7 +59,7 @@ redfaOptionToRuleStrings (RedfaOptionFile fn) = B8.lines <$> B.readFile fn -- TODO improve splitFirst :: Char -> BS -> (BS, BS) -splitFirst c s = B.splitAt (maybe (B.length s) id $ B8.elemIndex c s) s +splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool) redfaRuleStringToRuleStr s = @@ -68,7 +68,7 @@ redfaRuleStringToRuleStr s = sf = B8.strip from (cleanFrom, isToken) | B.null sf = (sf, True) - | B.head sf == (fromIntegral $ fromEnum '_') = (B.tail sf, False) + | B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False) | otherwise = (sf, True) go | B.null s = Nothing @@ -89,7 +89,7 @@ unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s fail "incomplete escape sequence" | B.head s == BI.c2w '\\' = let rest = B.tail s - cc = B.head (rest) + cc = B.head rest thechar = BB.stringUtf8 $ case BI.w2c cc of @@ -108,12 +108,12 @@ unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s '\\' -> "\\\\" a -> [a] --TODO add support for \x and \u in (thechar <>) <$> unescape' (B.tail rest) - | otherwise = (mappend $ BB.word8 $ B.head s) <$> unescape' (B.tail s) + | otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s) redfaPrepareRules :: RedfaOption -> IO RedfaSpec redfaPrepareRules opt = do (states, jumps, regexes, isToken) <- - unzip4 . catMaybes . fmap redfaRuleStringToRuleStr <$> + unzip4 . mapMaybe redfaRuleStringToRuleStr <$> redfaOptionToRuleStrings opt uRegexes <- traverse unescapeRegex regexes startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes @@ -148,7 +148,7 @@ redfaTokenize' spec s state off visited | off >= B.length s = pure [] | otherwise = let (ooff, reg) = - if (off == 0) + if off == 0 then (0, rrRegexStart) else (1, rrRegexMid) matchString = B.drop (off - ooff) s @@ -159,7 +159,7 @@ redfaTokenize' spec s state off visited contOK (RedfaRule {rrJump = j}, (off', len)) | off' /= 0 = False | len > ooff = True - | otherwise = not $ j `elem` visited + | otherwise = j `notElem` visited in case matches of [] -> fail $