some working version

2020-09-19 17:56:28 +02:00 · 2020-09-19 17:56:28 +02:00 · 64b5ca7ea7
parent bd4e79e064
commit 64b5ca7ea7
3 changed files with 241 additions and 28 deletions
--- a/src/Diff.hs
+++ b/src/Diff.hs
@ -32,31 +32,240 @@ data Op
  | Add
  deriving (Show, Eq)
 data DiffEnv =
  DiffEnv
    { deD1 :: BS
    , deD2 :: BS
    , deT1 :: V.Vector Tok
    , deT2 :: V.Vector Tok
    , deS :: Int
    , deE :: Int
    , deL :: Int
    , deW :: Int
    , deA :: Int
    , deB :: Int
    , deVS :: V.Vector Int
    , deVE :: V.Vector Int
    , deTrans :: Bool
    }
  deriving (Show)
 substr b e = B.take (e - b) . B.drop b
-toksMatch d1 d2 t1 t2 x y =
+toksMatch x y DiffEnv {deT1 = t1, deT2 = t2, deD1 = d1, deD2 = d2} =
-  let (tok1, (b1, e1)) = t1 V.! x
+  let (isTok1, (b1, e1)) = t1 V.! x
-      (tok2, (b2, e2)) = t2 V.! y
+      (isTok2, (b2, e2)) = t2 V.! y
-   in tok1 == tok2 && substr b1 e1 d1 == substr b2 e2 d2
+   in isTok1 == isTok2 && substr b1 e1 d1 == substr b2 e2 d2
-stripEqToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> (Int, Int, Int)
+stripEqToks :: DiffEnv -> (Int, Int, Int)
-stripEqToks d1 d2 t1 t2 = (b, l1 - e, l2 - e)
+stripEqToks de@DiffEnv {deT1 = t1, deT2 = t2} = (b, l1 - e, l2 - e)
  where
    l1 = V.length t1
    l2 = V.length t2
    firstDiff i
-      | i < l1 && i < l2 && toksMatch d1 d2 t1 t2 i i = firstDiff (i + 1)
+      | i < l1 && i < l2 && toksMatch i i de = firstDiff (i + 1)
      | otherwise = i
    b = firstDiff 0
    lastDiff i
      | l1 - i - 1 >= b &&
-          l2 - i - 1 >= b && toksMatch d1 d2 t1 t2 (l1 - i - 1) (l2 - i - 1) =
+          l2 - i - 1 >= b && toksMatch (l1 - i - 1) (l2 - i - 1) de =
        lastDiff (i + 1)
      | otherwise = i
    e = lastDiff 0
-diffToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> IO [(Op, Tok)]
+diffToks :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> [(Op, Tok)]
-diffToks d1 d2 t1 t2 = do
+diffToks d1 d2 t1 t2 =
  diffToks' $
  if V.length t1 >= V.length t2
    then DiffEnv
           { deD1 = d1
           , deD2 = d2
           , deT1 = t1
           , deT2 = t2
           , deS = 0
           , deE = V.length t1
           , deL = V.length t1
           , deW = V.length t2
           , deA = 0
           , deB = V.length t2
           , deVS = V.fromList [0 .. V.length t2]
           , deVE = V.fromList $ reverse [0 .. V.length t2]
           , deTrans = False
           }
    else DiffEnv
           { deD1 = d2
           , deD2 = d1
           , deT1 = t2
           , deT2 = t1
           , deS = 0
           , deE = V.length t2
           , deL = V.length t2
           , deW = V.length t1
           , deA = 0
           , deB = V.length t1
           , deVS = V.fromList [0 .. V.length t1]
           , deVE = V.fromList $ reverse [0 .. V.length t1]
           , deTrans = True
           }
 minIndexFwd =
  V.minIndexBy
    (\x y ->
       if x <= y
         then LT
         else GT --basically normal V.minIndex
     )
 minIndexRev =
  V.minIndexBy
    (\x y ->
       if x < y
         then LT
         else GT --picks the last minimum
     )
 diffToks' :: DiffEnv -> [(Op, Tok)]
 diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
  diff
  where
    mid = (s + e) `div` 2
    vecSmid = vecS mid
    vecEmid = vecE mid
    vecS = vec -- "forward" operation
      where
        vec i
          | i == s = deVS de
          | i > s = V.fromList . upd i . vec $ pred i
          | otherwise = error "Internal bounds check failure"
        upd i v = i : go 1 i
          where
            go j up
              | j > deW de = []
              | otherwise =
                let left = v V.! j
                    upleft = v V.! pred j
                    keep
                      | toksMatch (pred i) (pred j) de = min upleft
                      | otherwise = id
                    res = keep $ min (succ up) (succ left)
                 in res : go (succ j) res
    vecE = vec -- "backward" operation
      where
        vec i
          | i == e = deVE de
          | i < e = V.fromList . reverse . upd i . vec $ succ i
          | otherwise = error "Internal bounds check failure"
        upd i v = (deL de - i) : go (pred $ deW de) (deL de - i)
          where
            go j down
              | j < 0 = []
              | otherwise =
                let right = v V.! j
                    downright = v V.! succ j
                    keep
                      | toksMatch i j de = min downright
                      | otherwise = id
                    res = keep $ min (succ down) (succ right)
                 in res : go (pred j) res
    {- Now, find the optimal point for splitting.
     -
     - Heuristics A: if 2 paths are completely same, prefer deletion first;
     - which is done by choosing the 'upper' of two possibilities
     - preferentially (or 'lower' ie 'more to the right' in case of transposed
     - matrix) -}
    opt =
      (a +) .
      (if trans
         then minIndexRev
         else minIndexFwd) $
      V.zipWith (+) (slice vecSmid) (slice vecEmid)
      where
        slice = V.slice a (succ $ b - a)
    diff
      | s > e =
        error $
        "Internal failure -- recursion off limits: " <>
        show s <> " vs " <> show e
      | s == e = map (\i -> (Add, deT2 de V.! i)) [a .. pred b]
      | succ s == e =
        let vecLS = deVS de
            vecRE = deVE de
            vecLE = vecE s
            vecRS = vecS e
            sumL = V.zipWith (+) vecLS vecLE
            sumR = V.zipWith (+) vecRS vecRE
            {- This is getting a bit complicted. In the non-transposed case, we
             - want to select one Remove/Keep surrounded by 0-n Add ops, possibly
             - from both sides. The chosen path must belong to the best paths
             - (bidirectional matrix sums must match the minimum at (s,a) and
             - (b,e) on all steps), AND at the same time the path must be
             - admissible for the edit operations (ie, it has to `backtrack
             - well`). Also, it should follow Heuristic A that says that Remove
             - and Keep ops should go earlier than Add ops (or vice versa if
             - transposed). -}
            totalCost = sumL V.! a
            sCost = vecLS V.! a
            eCost = vecRS V.! b
            doKeep
              | eCost - sCost == succ (b - a) = False
              | eCost - sCost == pred (b - a) = True
              | otherwise =
                error $
                "Internal check failure -- costs seem broken: " <>
                show [sCost, eCost, a, b]
            jumpPos =
              (if trans {- Heuristic A applies here -}
                 then last
                 else head)
                [ i
                | i <-
                    [a .. if doKeep
                            then pred b
                            else b]
                , vecLS V.! i == sCost - a + i
                , sumL V.! i == totalCost
                , if doKeep
                    then vecLS V.! i == vecRS V.! succ i
                    else succ (vecLS V.! i) == vecRS V.! i
                , if doKeep
                    then sumR V.! succ i == totalCost
                    else sumR V.! i == totalCost
                , not doKeep || toksMatch s i de
                ]
            jumpEnd =
              if doKeep
                then jumpPos + 1
                else jumpPos
         in map
              (\i ->
                 ( if trans
                     then Remove
                     else Add
                 , deT2 de V.! i))
              [a .. pred jumpPos] ++
            [ if trans
                then if doKeep
                       then (Keep, deT2 de V.! jumpPos)
                       else (Add, deT1 de V.! s)
                else ( if doKeep
                         then Keep
                         else Remove
                     , deT1 de V.! s)
            ] ++
            map
              (\i ->
                 ( if trans
                     then Remove
                     else Add
                 , deT2 de V.! i))
              [jumpEnd .. pred b]
      | otherwise =
        diffToks' de {deE = mid, deVE = vecEmid, deB = opt} ++
        diffToks' de {deS = mid, deVS = vecSmid, deA = opt}
 {-
 diffToksXX :: BS -> BS -> V.Vector Tok -> V.Vector Tok -> IO [(Op, Tok)]
 diffToksXX d1 d2 t1 t2 = do
  let (b, e1, e2) = stripEqToks d1 d2 t1 t2
      ms1 = e1 - b
      ms2 = e2 - b
@ -96,7 +305,7 @@ diffToks d1 d2 t1 t2 = do
  return $
    map ((,) Keep) (take b $ V.toList t1) ++
    diff ++ map ((,) Keep) (drop e1 $ V.toList t1)
-
+-}
 pprDiff :: BS -> BS -> [(Op, Tok)] -> [BS]
 pprDiff d1 d2 = map (pprDiff1 d1 d2)
--- a/src/MainDiff.hs
+++ b/src/MainDiff.hs
@ -71,8 +71,8 @@ patchCmdOptions =
    auto
    (short 'p' <>
     long "strip" <>
-     metavar "NUM" <> help "Strip NUM leading components from the paths" <>
+     metavar "NUM" <>
-     value 0) <*>
+     help "Strip NUM leading components from the paths" <> value 0) <*>
  mergeOption True
 diff3CmdOptions =
@ -103,11 +103,15 @@ main =
           progDesc
             "Compare, patch and merge files on arbitrarily-tokenized sequences." <>
           header "adiff: arbitrary-token diff utilities")
-   in do o <- execParser opts
+   in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <-
-         redfa <- redfaPrepareRules (diffRedfaOpt o)
+           execParser opts
-         data1 <- mmapFileByteString (diffFile1 o) Nothing
+         redfa <- redfaPrepareRules ropt
-         data2 <- mmapFileByteString (diffFile2 o) Nothing
+         case copt of
-         toks1 <- V.fromList <$> redfaTokenize redfa data1
+           CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
-         toks2 <- V.fromList <$> redfaTokenize redfa data2
+             data1 <- mmapFileByteString f1 Nothing
-         hs <- hunks (max 0 $ context o) <$> diffToks data1 data2 toks1 toks2
+             data2 <- mmapFileByteString f2 Nothing
-         B8.putStrLn $ pprHunks data1 data2 hs
+             toks1 <- V.fromList <$> redfaTokenize redfa data1
             toks2 <- V.fromList <$> redfaTokenize redfa data2
             let hs = hunks (max 0 ctxt) $ diffToks data1 data2 toks1 toks2
             B8.putStrLn $ pprHunks data1 data2 hs
           CmdPatch {} -> putStrLn "not supported yet"
--- a/src/Redfa.hs
+++ b/src/Redfa.hs
@ -59,7 +59,7 @@ redfaOptionToRuleStrings (RedfaOptionFile fn) =
  B8.lines <$> B.readFile fn -- TODO improve
 splitFirst :: Char -> BS -> (BS, BS)
-splitFirst c s = B.splitAt (maybe (B.length s) id $ B8.elemIndex c s) s
+splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
 redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
 redfaRuleStringToRuleStr s =
@ -68,7 +68,7 @@ redfaRuleStringToRuleStr s =
      sf = B8.strip from
      (cleanFrom, isToken)
        | B.null sf = (sf, True)
-        | B.head sf == (fromIntegral $ fromEnum '_') = (B.tail sf, False)
+        | B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False)
        | otherwise = (sf, True)
      go
        | B.null s = Nothing
@ -89,7 +89,7 @@ unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s
        fail "incomplete escape sequence"
      | B.head s == BI.c2w '\\' =
        let rest = B.tail s
-            cc = B.head (rest)
+            cc = B.head rest
            thechar =
              BB.stringUtf8 $
              case BI.w2c cc of
@ -108,12 +108,12 @@ unescapeRegex s = BL.toStrict . BB.toLazyByteString <$> unescape' s
                '\\' -> "\\\\"
                a -> [a] --TODO add support for \x and \u
         in (thechar <>) <$> unescape' (B.tail rest)
-      | otherwise = (mappend $ BB.word8 $ B.head s) <$> unescape' (B.tail s)
+      | otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s)
 redfaPrepareRules :: RedfaOption -> IO RedfaSpec
 redfaPrepareRules opt = do
  (states, jumps, regexes, isToken) <-
-    unzip4 . catMaybes . fmap redfaRuleStringToRuleStr <$>
+    unzip4 . mapMaybe redfaRuleStringToRuleStr <$>
    redfaOptionToRuleStrings opt
  uRegexes <- traverse unescapeRegex regexes
  startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
@ -148,7 +148,7 @@ redfaTokenize' spec s state off visited
  | off >= B.length s = pure []
  | otherwise =
    let (ooff, reg) =
-          if (off == 0)
+          if off == 0
            then (0, rrRegexStart)
            else (1, rrRegexMid)
        matchString = B.drop (off - ooff) s
@ -159,7 +159,7 @@ redfaTokenize' spec s state off visited
        contOK (RedfaRule {rrJump = j}, (off', len))
          | off' /= 0 = False
          | len > ooff = True
-          | otherwise = not $ j `elem` visited
+          | otherwise = j `notElem` visited
     in case matches of
          [] ->
            fail $