a bit of a revamp

2022-05-25 08:30:09 +02:00 · 2022-05-25 08:30:09 +02:00 · 6e2ab88961
parent 441e2b4351
commit 6e2ab88961
15 changed files with 423 additions and 289 deletions
--- a/README.md
+++ b/README.md
@ -12,10 +12,14 @@ standard unix `diff` and related tools.
 The problem was previously tackled by Arek Antoniewicz on MFF CUNI, who
 produced a working software package in C++, and designed the Regex-edged DFAs
 (REDFAs) that were used for user-specifiable tokenization of the input. The
-work on the corresponding thesis is currently ongoing.
+work on the corresponding thesis is finished.

 This started as a simple Haskell port of that work, and packed some relatively
-orthogonal improvements (mainly the histogram-style diffing).
+orthogonal improvements (mainly the histogram-style diffing). I later got rid
+of the REDFA concept -- while super-interesting and useful in theory, I didn't
+find a sufficiently universal way to build good lexers from user-specified
+strings.  Having a proper Regex representation library (so that e.g.
+reconstructing Flex is easy) would help a lot.

 ### TODO list

@ -23,16 +27,12 @@ orthogonal improvements (mainly the histogram-style diffing).
  of hunk context. `diff` and `diff3` works.
 - Implement a splitting heuristic for diffs, so that diffing of large files
  doesn't take aeons
- Check whether REDFA can even be implemented correctly with current Haskell
-  libraries (most regex libraries target a completely different). Taking the
-  lexer specification format from `alex` currently seems like a much better
-  option. Deferring the task unix-ishly to another program could work too.
+- check if we can have external lexers, unix-style

 # How-To

 Install using `cabal`. The `adiff` program has 3 sub-commands that work like
-`diff`, `patch` and `diff3`. It expects a lexing specification on the input;
-there are several very simple example lexers in `lexers/`.
+`diff`, `patch` and `diff3`.

 ## Example

@ -49,15 +49,11 @@ Patching is hard. I still cannot rhyme.
 ```

 Let's use the `words` lexer, which marks everything whitespace-ish as
-whitespace, and picks up groups of non-whitespace "content" characters:
-```
-:[^ \t\n]*
-_:[ \t\n]*
-```
+whitespace, and picks up groups of non-whitespace "content" characters.

 Diffing the 2 files gets done as such:
 ```
- $ cabal run adiff -- -l lexers/words diff orig mine
+ $ cabal run adiff -- -l words diff orig mine
 ```

 You should get something like this:
@ -99,7 +95,7 @@ I cannot do verses.
 We can run `diff3` to get a patch with both changes, optionally with reduced
 context:
 ```
- $ cabal run adiff -- -l lexers/words diff3 mine orig yours -C1
+ $ cabal run adiff -- -l words diff3 mine orig yours -C1
 ```
 ...which outputs:
 ```
--- a/adiff.cabal
+++ b/adiff.cabal
@ -51,8 +51,7 @@ executable adiff
                 Hunks,
                 Merge,
                 Patch,
-                 Redfa,
-                 Substr,
+                 Tokenizers,
                 Types,
                 Version

@ -60,13 +59,15 @@ executable adiff
  other-extensions: CPP

  -- Other library packages from which modules are imported.
-  build-depends: base ^>=4.13.0.0,
+  build-depends: base ^>=4.15.0.0,
+                 attoparsec ^>=0.14,
                 extra ^>= 1.7,
                 mmap ^>=0.5,
-                 regex-tdfa ^>= 1.3,
                 optparse-applicative ^>=0.16,
-                 bytestring ^>= 0.10.12,
+                 bytestring ^>= 0.11.2,
                 vector ^>=0.12,
+                 word8 ^>=0.1,
+                 unicode-data ^>=0.3,
                 utf8-string ^>=1.0

  -- Directories containing source files.
--- a/lexers/letters
+++ b/lexers/letters
@ -1,2 +0,0 @@
-[a-z]
-_:[ \n]
--- a/lexers/lines
+++ b/lexers/lines
@ -1,2 +0,0 @@
-[^\n]*\n
-[^\n]*
--- a/lexers/nums
+++ b/lexers/nums
@ -1,2 +0,0 @@
-[0-9]
-_:\n
--- a/lexers/words
+++ b/lexers/words
@ -1,2 +0,0 @@
-:[^ \t\n]*
-_:[ \t\n]*
--- a/src/Diff.hs
+++ b/src/Diff.hs
@ -20,9 +20,9 @@ data DiffEnv =
    , deB :: Int
    , deVS :: V.Vector (Int, Int)
    , deVE :: V.Vector (Int, Int)
+    , deTokPrio :: Tok -> Int
    , deTrans :: Bool
    }
-  deriving (Show)

 toksMatch :: Int -> Int -> DiffEnv -> Bool
 toksMatch x y DiffEnv {deT1 = t1, deT2 = t2} = t1 V.! x == t2 V.! y
@ -47,10 +47,19 @@ stripEqToks t1 t2 = (pre, post, t1', t2')
    t1' = V.slice b (l1 - e - b) t1
    t2' = V.slice b (l2 - e - b) t2

+makePrios :: TV -> TV -> (Bool, BS) -> Int
+makePrios _ _ = get
+  where
+    get (isToken, str) =
+      if isToken
+        then B.length str
+        else 0
+
 diffToks :: TV -> TV -> Diff
 diffToks t1' t2' = pre ++ res ++ post
  where
    (pre, post, t1, t2) = stripEqToks t1' t2'
+    stats = makePrios t1' t2'
    res
      | V.null t1 = map (Add, ) (V.toList t2)
      | V.null t2 = map (Remove, ) (V.toList t1)
@ -67,6 +76,7 @@ diffToks t1' t2' = pre ++ res ++ post
          , deB = V.length t2
          , deVS = V.fromList $ zip [0 .. V.length t2] (repeat 0)
          , deVE = V.fromList $ reverse $ zip [0 .. V.length t2] (repeat 0)
+          , deTokPrio = stats
          , deTrans = False
          }
      | otherwise =
@ -82,6 +92,7 @@ diffToks t1' t2' = pre ++ res ++ post
          , deB = V.length t1
          , deVS = V.fromList $ zip [0 .. V.length t1] (repeat 0)
          , deVE = V.fromList $ reverse $ zip [0 .. V.length t1] (repeat 0)
+          , deTokPrio = stats
          , deTrans = True
          }

@ -110,12 +121,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
    mid = quot (s + e) 2
    vecSmid = vecS mid
    vecEmid = vecE mid
-    extraScore i =
-      if isToken
-        then -(B.length str)
-        else 0
-      where
-        (isToken, str) = deT1 de V.! i
+    prio i = negate . deTokPrio de $ deT1 de V.! i
    vecS = vec -- "forward" operation
      where
        vec i
@ -131,7 +137,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
                    (iupleft, supleft) = v V.! pred j
                    keep
                      | toksMatch (pred i) (pred j) de =
-                        min (iupleft, supleft + extraScore (pred i))
+                        min (iupleft, supleft + prio (pred i))
                      | otherwise = id
                    res = keep $ min (succ iup, sup) (succ ileft, sleft)
                 in res : go (succ j) res
@ -150,7 +156,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
                    (idownright, sdownright) = v V.! succ j
                    keep
                      | toksMatch i j de =
-                        min (idownright, sdownright + extraScore i)
+                        min (idownright, sdownright + prio i)
                      | otherwise = id
                    res = keep $ min (succ idown, sdown) (succ iright, sright)
                 in res : go (pred j) res
@ -213,7 +219,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
                , fst (vecLS V.! i) == fst sCost - a + i
                , sumL V.! i == totalCost
                , if doKeep
-                    then scoreAdd (vecLS V.! i) (0, extraScore s) ==
+                    then scoreAdd (vecLS V.! i) (0, prio s) ==
                         vecRS V.! succ i
                    else scoreAdd (vecLS V.! i) (1, 0) == vecRS V.! i
                , if doKeep
@ -223,7 +229,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
                ]
            jumpEnd =
              if doKeep
-                then jumpPos + 1
+                then succ jumpPos
                else jumpPos
         in map
              (\i ->
--- a/src/Format.hs
+++ b/src/Format.hs
@ -1,30 +1,39 @@
-{-# LANGUAGE OverloadedStrings #-}
-
 module Format
  ( pprHunks
  , pprHunk
-  , pprDiff1
+  , parsePatch
  ) where

 import Types

+import Control.Applicative
+import Control.Monad
+import Data.Attoparsec.ByteString as A
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Builder as BB
 import qualified Data.ByteString.Internal as BI
+import qualified Data.ByteString.Lazy as BL
 import Data.String
+import Data.Word8 as W8

-pprHunkHdr :: Int -> Int -> BB.Builder
-pprHunkHdr i j = fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@"
+backslash :: Word8
+backslash = BI.c2w '\\'

-lineSep :: BB.Builder
-lineSep = fromString "\n"
+newline :: Word8
+newline = BI.c2w '\n'

 pprHunks :: [Hunk] -> BB.Builder
 pprHunks = mconcat . map pprHunk

+lineSep :: BB.Builder
+lineSep = BB.word8 newline
+
+pprHunkHdr :: Int -> Int -> BB.Builder
+pprHunkHdr i j =
+  (fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@") <> lineSep
+
 pprHunk :: Hunk -> BB.Builder
-pprHunk ((i, j), toks) =
-  mconcat ((pprHunkHdr i j <> lineSep) : map pprDiff1 toks)
+pprHunk ((i, j), toks) = mconcat (pprHunkHdr i j : map pprDiff1 toks)

 pprDiff1 :: (Op, Tok) -> BB.Builder
 pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
@ -46,6 +55,74 @@ pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
 escNewlines :: BS -> BB.Builder
 escNewlines s
  | B.null s = mempty
-  | B.head s == BI.c2w '\n' = BB.stringUtf8 "\\n" <> escNewlines (B.tail s)
-  | B.head s == BI.c2w '\\' = BB.stringUtf8 "\\\\" <> escNewlines (B.tail s)
+  | B.head s == newline =
+    BB.word8 backslash <> BB.word8 (BI.c2w 'n') <> escNewlines (B.tail s)
+  | B.head s == backslash =
+    BB.word8 backslash <> BB.word8 backslash <> escNewlines (B.tail s)
  | otherwise = BB.word8 (B.head s) <> escNewlines (B.tail s)
+
+--parsePatch :: BS -> Either String [Hunk]
+parsePatch = parseOnly parseHunks
+
+parseHunks :: Parser [Hunk]
+parseHunks = many parseHunk <* endOfInput
+
+parseHunk :: Parser Hunk
+parseHunk = liftA2 (,) parseHunkHdr (many parseDiff1)
+
+parseInt :: Parser Int
+parseInt = read . map BI.w2c <$> some (satisfy W8.isDigit)
+
+eol :: Parser ()
+eol = void $ word8 newline
+
+parseHunkHdr :: Parser (Int, Int)
+parseHunkHdr = do
+  void . string $ fromString "@@ -"
+  i <- parseInt
+  void . string $ fromString " +"
+  j <- parseInt
+  void . string $ fromString " @@"
+  eol
+  return (i, j)
+
+pairs2parsers :: [(a, Char)] -> [Parser a]
+pairs2parsers = map (\(x, ch) -> x <$ word8 (BI.c2w ch))
+
+parseOpList :: [Parser Op]
+parseOpList =
+  pairs2parsers
+    [ (Add, '+')
+    , (Keep, ' ')
+    , (Remove, '-')
+    , (MineChanged, '<')
+    , (Original, '=')
+    , (YourChanged, '>')
+    ]
+
+parseOp :: Parser Op
+parseOp = choice parseOpList
+
+parseTokMarkList :: [Parser Bool]
+parseTokMarkList = pairs2parsers [(True, '|'), (False, '.')]
+
+parseTokMark :: Parser Bool
+parseTokMark = choice parseTokMarkList
+
+parseTokBS :: Parser BS
+parseTokBS =
+  (BL.toStrict . BB.toLazyByteString . mconcat <$> many parseTokChar) <* eol
+
+parseTokChar :: Parser BB.Builder
+parseTokChar =
+  choice
+    [ BB.word8 newline <$ string (fromString "\\n")
+    , BB.word8 backslash <$ string (fromString "\\\\")
+    , BB.word8 <$> satisfy (\w -> w /= backslash && w /= newline)
+    ]
+
+parseTok :: Parser Tok
+parseTok = liftA2 (,) parseTokMark parseTokBS
+
+parseDiff1 :: Parser (Op, Tok)
+parseDiff1 = liftA2 (,) parseOp parseTok
--- a/src/Main.hs
+++ b/src/Main.hs
@ -1,6 +1,10 @@
 module Main where

+import Control.Exception
+import Control.Monad
+import qualified Data.ByteString as B
 import qualified Data.ByteString.Builder as BB
+import Data.Foldable (traverse_)
 import qualified Data.Vector as V
 import Diff
 import Diff3
@ -8,15 +12,17 @@ import Format
 import Hunks
 import Merge
 import Options.Applicative
-import Redfa
-import System.IO (stdout)
+import Patch
+import System.Exit
+import System.IO (hPutStrLn, stderr, stdout)
 import System.IO.MMap
+import Tokenizers
 import Types
 import Version

 data ADiffOptions =
  ADiffOptions
-    { adiffRedfaOpt :: RedfaOption
+    { adiffTokOpts :: TokOpts
    , adiffCmdOpts :: ADiffCommandOpts
    }
  deriving (Show)
@ -29,11 +35,16 @@ data ADiffCommandOpts
      }
  | CmdPatch
      { patchDryRun :: Bool
-      , patchInDir :: Maybe String
-      , patchInput :: String
+      --, patchInDir :: Maybe String
+      --, patchPathStrip :: Int
+      , patchInputPatch :: String
+      , patchOutput :: String
+      , patchReject :: String --todo convert to Maybes with optional
+      , patchBackup :: String
      , patchReverse :: Bool
-      , patchPathStrip :: Int
+      , context :: Int
      , patchMergeOpts :: MergeOpts
+      , patchInput :: String
      }
  | CmdDiff3
      { context :: Int
@ -44,15 +55,22 @@ data ADiffCommandOpts
      }
  deriving (Show)

-contextOpt :: Parser Int
-contextOpt =
+contextOpt :: Bool -> Parser Int
+contextOpt forPatch =
  check <$>
  option
    auto
    (metavar "CONTEXT" <>
     short 'C' <>
     long "context" <>
-     value 5 <> help "How many tokens around the change to include in the patch")
+     value
+       (if forPatch
+          then 4
+          else 8) <>
+     help
+       (if forPatch
+          then "Minimum amount of context tokens that must match so that the hunk is applied"
+          else "How many tokens around the change to include in the patch"))
  where
    check c
      | c < 0 = error "Negative context"
@ -60,7 +78,7 @@ contextOpt =

 diffCmdOptions :: Parser ADiffCommandOpts
 diffCmdOptions =
-  CmdDiff <$> contextOpt <*> strArgument (metavar "FROMFILE") <*>
+  CmdDiff <$> contextOpt False <*> strArgument (metavar "FROMFILE") <*>
  strArgument (metavar "TOFILE")

 patchCmdOptions :: Parser ADiffCommandOpts
@ -70,33 +88,49 @@ patchCmdOptions =
    (short 'n' <>
     long "dry-run" <>
     help "Do not patch anything, just print what would happen") <*>
-  optional
-    (strOption $
-     short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
+  -- optional (strOption $ short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
+  --option auto (short 'p' <> long "strip" <> metavar "NUM" <> help "Strip NUM leading components from the paths" <> value 0) <*>
  strOption
    (short 'i' <>
     long "input" <>
-     metavar "INPUT" <>
-     help "Read the patchfile from INPUT, defaults to `-' for STDIN" <>
+     metavar "PATCHFILE" <>
+     help "Read the patchfile from PATCHFILE, defaults to `-' for STDIN" <>
     value "-") <*>
+  strOption
+    (short 'o' <>
+     long "output" <>
+     metavar "OUTPUT" <>
+     help
+       "Write the patched file to OUTPUT, use `-' for STDOUT. By default, INPUT is rewritten." <>
+     value "") <*>
+  strOption
+    (short 'r' <>
+     long "reject" <>
+     metavar "REJECTS" <>
+     help
+       "Write the rejected hunks file to file REJECTS, instead of default `OUTPUT.rej'. Use `-' to discard rejects." <>
+     value "") <*>
+  strOption
+    (short 'b' <>
+     long "backup" <>
+     metavar "BACKUP" <>
+     help
+       "When rewriting INPUT after a partially applied or otherwise suspicious patch, back up the original file in BACKUP instead of default `INPUT.orig'. Use `-' to discard backups." <>
+     value "") <*>
  switch (short 'R' <> long "reverse" <> help "Unapply applied patches") <*>
-  option
-    auto
-    (short 'p' <>
-     long "strip" <>
-     metavar "NUM" <>
-     help "Strip NUM leading components from the paths" <> value 0) <*>
-  mergeOption True
+  contextOpt True <*>
+  mergeOption True <*>
+  strArgument (metavar "INPUT")

 diff3CmdOptions :: Parser ADiffCommandOpts
 diff3CmdOptions =
-  CmdDiff3 <$> contextOpt <*> strArgument (metavar "MYFILE") <*>
+  CmdDiff3 <$> contextOpt False <*> strArgument (metavar "MYFILE") <*>
  strArgument (metavar "OLDFILE") <*>
  strArgument (metavar "YOURFILE") <*>
  mergeOption False

-actionOption :: Parser ADiffCommandOpts
-actionOption =
+actionOptions :: Parser ADiffCommandOpts
+actionOptions =
  hsubparser $
  mconcat
    [ command "diff" $ info diffCmdOptions $ progDesc "Compare two files"
@ -106,36 +140,106 @@ actionOption =
    ]

 adiffOptions :: Parser ADiffOptions
-adiffOptions = ADiffOptions <$> redfaOption <*> actionOption
+adiffOptions = ADiffOptions <$> tokOptions <*> actionOptions

-loadToks :: RedfaSpec -> FilePath -> IO TV
-loadToks redfa f =
-  V.fromList <$> (mmapFileByteString f Nothing >>= redfaTokenize redfa)
+-- TODO: load in case it's not a regular file
+loadToks :: TokOpts -> FilePath -> IO TV
+loadToks topt f =
+  V.fromList <$> (mmapFileByteString f Nothing >>= tokenize topt f)

-main :: IO ()
-main =
+doDiff :: TokOpts -> ADiffCommandOpts -> IO ()
+doDiff topt (CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt}) = do
+  [toks1, toks2] <- traverse (loadToks topt) [f1, f2]
+  let output = hunks (max 0 ctxt) $ diffToks toks1 toks2
+  BB.hPutBuilder stdout $ pprHunks output
+  unless (null output) $ exitWith (ExitFailure 1)
+doDiff _ _ = error "dispatch failure"
+
+doDiff3 :: TokOpts -> ADiffCommandOpts -> IO ()
+doDiff3 topt (CmdDiff3 ctxt f1 f2 f3 mo) = do
+  [toksMine, toksOld, toksYour] <- traverse (loadToks topt) [f1, f2, f3]
+  let d3 = diff3Toks mo toksMine toksOld toksYour
+      isConflict (MineChanged, _) = True
+      isConflict (YourChanged, _) = True
+      isConflict _ = False
+      hasConflict = any isConflict d3
+  BB.hPutBuilder stdout $
+    if mergeDoMerge mo
+      then do
+        fmtMerged mo d3
+      else pprHunks $ hunks (max 0 ctxt) d3
+  when hasConflict $ exitWith (ExitFailure 1)
+doDiff3 _ _ = error "dispatch failure"
+
+doPatch :: TokOpts -> ADiffCommandOpts -> IO ()
+doPatch topt o = do
+  toksIn <- loadToks topt (patchInput o)
+  hs' <-
+    parsePatch <$>
+    case (patchInputPatch o) of
+      "-" -> B.getContents
+      fn -> B.readFile fn
+  hs <-
+    case hs' of
+      Left _ -> ioError $ userError "PATCHFILE parsing failed"
+      Right x -> pure x
+  let (toks, rej, warns) =
+        patchToks toksIn hs (patchReverse o) (patchMergeOpts o)
+      sus = not (null warns)
+      rewritingInput = null (patchOutput o)
+      outputStdout = patchOutput o == "-"
+      rejfile
+        | rewritingInput || outputStdout = patchInput o ++ ".rej"
+        | otherwise = patchOutput o ++ ".rej"
+      backupfile
+        | patchBackup o == "-" = ""
+        | patchBackup o == "" && rewritingInput = patchInput o ++ ".orig"
+        | otherwise = patchBackup o
+  traverse_ (hPutStrLn stderr . pprPatchWarn) warns
+  if patchDryRun o
+    then do
+      hPutStrLn stderr $
+        (if sus
+           then "OK"
+           else "Possibly problematic") ++
+        " patch with " ++ show (length rej :: Int) ++ " rejected hunks"
+      BB.hPutBuilder stdout (pprHunks rej)
+    else do
+      when (not $ null rej) $ BB.writeFile rejfile (pprHunks rej)
+      when (sus && not (null backupfile)) $
+        B.readFile (patchInput o) >>= B.writeFile backupfile
+      ($ mconcat $ map (BB.byteString . snd) $ V.toList toks) $
+        if outputStdout
+          then BB.hPutBuilder stdout
+          else BB.writeFile
+                 (if rewritingInput
+                    then patchInput o
+                    else patchOutput o)
+  when sus $ exitWith (ExitFailure 1)
+
+main' :: IO ()
+main' =
  let opts :: ParserInfo ADiffOptions
      opts =
        info
          (adiffOptions <**> versionOption "adiff" <**> helperOption)
          (fullDesc <>
           progDesc
-             "Compare, patch and merge files on arbitrarily-tokenized sequences." <>
+             "Compare, patch and merge files on arbitrarily tokenized sequences." <>
           header "adiff: arbitrary-token diff utilities")
-   in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <-
+   in do ADiffOptions {adiffTokOpts = topt, adiffCmdOpts = copt} <-
           customExecParser (prefs $ helpShowGlobals <> subparserInline) opts
-         redfa <- redfaPrepareRules ropt
-         case copt of
-           CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
-             [toks1, toks2] <- traverse (loadToks redfa) [f1, f2]
-             BB.hPutBuilder stdout $
-               pprHunks $ hunks (max 0 ctxt) $ diffToks toks1 toks2
-           CmdPatch {} -> putStrLn "not supported yet"
-           CmdDiff3 ctxt f1 f2 f3 mo -> do
-             [toksMine, toksOld, toksYour] <-
-               traverse (loadToks redfa) [f1, f2, f3]
-             let d3 = diff3Toks mo toksMine toksOld toksYour
-             BB.hPutBuilder stdout $
-               if mergeDoMerge mo
-                 then fmtMerged mo d3
-                 else pprHunks $ hunks (max 0 ctxt) d3
+         (case copt of
+            CmdDiff {} -> doDiff
+            CmdDiff3 {} -> doDiff3
+            CmdPatch {} -> doPatch)
+           topt
+           copt
+
+main :: IO ()
+main =
+  main' `catch`
+  (\e -> do
+     let err = show (e :: IOException)
+     hPutStrLn stderr err
+     exitWith $ ExitFailure 2)
--- a/src/Merge.hs
+++ b/src/Merge.hs
@ -33,7 +33,7 @@ mergeOption forPatch =
   switch
     (short 'a' <>
      long "add-linebreak" <>
-      help "Automatically add a line break after conflict markers") <*>
+      help "Automatically add a line break after conflict markers. Useful with `lines' lexer.") <*>
   mo)
  where
    marker = fromString . replicate 7
@ -44,7 +44,7 @@ mergeOption forPatch =
         long "merge" <>
         help
           (if forPatch
-              then "Merge using conflict markers instead of printing the rejected hunks"
+              then "Instead of printing the rejected thunks, merge using conflict markers as if the INPUT was `MYFILE' and the patch would produced `YOURFILE' from the original."
              else "Output the merged file instead of the patch")) <*>
      switch
        (short 'i' <>
@ -69,11 +69,11 @@ mergeOption forPatch =
         help
           ("On whitespace mismatch, output the version from " ++
            (if forPatch
-               then "original file"
+               then "the original file"
               else "MYFILE") ++
            " instead of the one from " ++
            (if forPatch
-               then "patch"
+               then "the context in patch"
               else "YOURFILE"))) <*>
      strOption
        (long "merge-start" <>
--- a/src/Patch.hs
+++ b/src/Patch.hs
@ -1,4 +1,30 @@
-module Patch where
+module Patch (patchToks, pprPatchWarn) where

-patchToks :: a
-patchToks = undefined
+import Format
+import Types
+import Merge
+import qualified Data.ByteString as B
+
+data PatchWarn = HunkMatched Hunk Int | HunkFailed Hunk
+
+pprPatchWarn :: PatchWarn -> String
+pprPatchWarn (HunkMatched ((o,n), _) at) = "hunk (-"++ show o ++ " +"++show n++") succeeded at "++show at
+pprPatchWarn (HunkFailed ((o,n),_)) = "hunk (-"++ show o ++ " +"++show n++") FAILED"
+
+patchToks :: TV -> [Hunk] -> Bool -> MergeOpts -> (TV, [Hunk], [PatchWarn])
+patchToks toks hunks rev mopt = undefined
+  --let (tokss, hunk, sus) = patchHunksAt 0 0 0 (if rev then map revHunk hunks else hunks) mopt
+  
+
+patchHunksAt :: TV -> Int -> Int -> Int -> [Hunk] -> MergeOpts -> ([TV], [PatchWarn])
+patchHunksAt toks tvoff origoff newoff hunks mopt = undefined
+
+hunkToMatch :: Diff -> Diff
+hunkToMatch = filter $ (`elem` [Keep, Remove, Original]) . fst
+
+hunkToReplace :: Diff -> Diff
+hunkToReplace = filter $ (`elem` [Keep, Add, MineChanged, YourChanged]) . fst
+
+tokCmp :: Bool -> Tok -> Tok -> Bool
+tokCmp True (False, _) (False, _) = True -- ignore the whitespace change
+tokCmp _ a b = a == b -- otherwise just compare
--- a/src/Redfa.hs
+++ b/src/Redfa.hs
@ -1,177 +0,0 @@
-{-# LANGUAGE FlexibleContexts #-}
-
-module Redfa where
-
-import Control.Monad
-import qualified Data.ByteString as B
-import qualified Data.ByteString.Builder as BB
-import qualified Data.ByteString.Char8 as B8
-import qualified Data.ByteString.Internal as BI
-import qualified Data.ByteString.Lazy as BL
-import Data.ByteString.UTF8 (fromString, toString)
-import Data.List
-import Data.Maybe
-import qualified Data.Vector as V
-import Options.Applicative
-import Substr
-import Text.Regex.TDFA
-import Types
-
-data RedfaOption
-  = RedfaOptionRules [BS]
-  | RedfaOptionFile String
-  deriving (Show)
-
-data RedfaSpec =
-  RedfaSpec
-    { redfaStart :: Int
-    , redfaNames :: V.Vector BS
-    , redfaRules :: V.Vector [RedfaRule] --start, mid, jump, token
-    }
-
-data RedfaRule =
-  RedfaRule
-    { rrRegexStart :: Regex
-    , rrRegexMid :: Regex
-    , rrJump :: Int
-    , rrIsToken :: Bool
-    }
-
-redfaOption :: Parser RedfaOption
-redfaOption =
-  RedfaOptionRules <$>
-  some
-    (strOption $
-     metavar "RULE" <>
-     short 'L' <>
-     long "lex" <> help "Lexing rule (specify repeatedly for more rules)") <|>
-  RedfaOptionFile <$>
-  strOption
-    (metavar "FILE" <>
-     short 'l' <>
-     long "lexer" <> help "File from where to load the lexing rules")
-
-redfaOptionToRuleStrings :: RedfaOption -> IO [BS]
-redfaOptionToRuleStrings (RedfaOptionRules x) = return x
-redfaOptionToRuleStrings (RedfaOptionFile fn) =
-  B8.lines <$> B.readFile fn -- TODO improve
-
-splitFirst :: Char -> BS -> (BS, BS)
-splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
-
-redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
-redfaRuleStringToRuleStr s =
-  let (spec, regex) = splitFirst ':' s
-      (from, to) = splitFirst '>' spec
-      sf = B8.strip from
-      (cleanFrom, isToken)
-        | B.null sf = (sf, True)
-        | B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False)
-        | otherwise = (sf, True)
-      go
-        | B.null s = Nothing
-        | B.head s == 35 = Nothing -- # comment
-        | B.null regex = Just (B.empty, B.empty, spec, isToken)
-        | B.null to = Just (cleanFrom, cleanFrom, B.tail regex, isToken)
-        | otherwise =
-          Just (cleanFrom, B8.strip $ B.tail to, B.tail regex, isToken)
-   in go
-
-unescapeRegex :: MonadFail m => BS -> m BS
-unescapeRegex s' = BL.toStrict . BB.toLazyByteString <$> unescape' s'
-  where
-    unescape' :: MonadFail m => BS -> m BB.Builder
-    unescape' s
-      | B.null s = return mempty
-      | B.head s == BI.c2w '\\' && B.null (B.tail s) =
-        fail "incomplete escape sequence"
-      | B.head s == BI.c2w '\\' =
-        let rest = B.tail s
-            cc = B.head rest
-            thechar =
-              BB.stringUtf8 $
-              case BI.w2c cc of
-                '`' -> "\\`"
-                '\'' -> "\\'"
-                'b' -> "\\b"
-                'B' -> "\\B"
-                '<' -> "\\<"
-                '>' -> "\\>"
-                'a' -> "\a"
-                'e' -> "\x1b"
-                'f' -> "\f"
-                'n' -> "\n"
-                'r' -> "\r"
-                't' -> "\t"
-                '\\' -> "\\\\"
-                a -> [a] --TODO add support for \x and \u
-         in (thechar <>) <$> unescape' (B.tail rest)
-      | otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s)
-
-redfaPrepareRules :: RedfaOption -> IO RedfaSpec
-redfaPrepareRules opt = do
-  (states, jumps, regexes, isToken) <-
-    unzip4 . mapMaybe redfaRuleStringToRuleStr <$> redfaOptionToRuleStrings opt
-  uRegexes <- traverse unescapeRegex regexes
-  startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
-  midREs <- traverse (makeRegexM . (fromString "\\`(.|\n)" <>)) uRegexes
-  let ids = nub . sort $ states ++ jumps
-      index' = fromJust . flip elemIndex ids
-      statesIds = map index' states
-      jumpsIds = map index' jumps
-      start = index' $ head states
-      rules :: [[RedfaRule]]
-      rules = do
-        stateId <- [0 .. length ids - 1]
-        return $ do
-          (a, b, (rs, rm), t) <-
-            zip4 statesIds jumpsIds (zip startREs midREs) isToken
-          guard $ a == stateId
-          return $ RedfaRule rs rm b t
-  return $ RedfaSpec start (V.fromList ids) (V.fromList rules)
-
-redfaTokenize :: MonadFail m => RedfaSpec -> BS -> m [Tok]
-redfaTokenize spec s = redfaTokenize' spec s (redfaStart spec) 0 []
-
-redfaTokenize' ::
-     MonadFail m => RedfaSpec -> BS -> Int -> Int -> [Int] -> m [Tok]
-redfaTokenize' spec s state off visited
-  | off >= B.length s = pure []
-  | otherwise =
-    let (ooff, reg) =
-          if off == 0
-            then (0, rrRegexStart)
-            else (1, rrRegexMid)
-        matchString = B.drop (off - ooff) s
-        matches :: [(RedfaRule, (MatchOffset, MatchLength))]
-        matches =
-          filter contOK $
-          zip rules $ map (\x -> match (reg x) matchString) rules
-        contOK (RedfaRule {rrJump = j}, (off', len))
-          | off' /= 0 = False
-          | len > ooff = True
-          | otherwise = j `notElem` visited
-     in case matches of
-          [] ->
-            fail $
-            "Tokenization could not continue from " <>
-            stateStrPretty <> " at offset " <> show (off + ooff)
-          ((rule, (_, len)):_) ->
-            let matchLen = len - ooff
-             in (if matchLen > 0
-                   then (:) (rrIsToken rule, substr off matchLen s)
-                   else id) <$>
-                redfaTokenize'
-                  spec
-                  s
-                  (rrJump rule)
-                  (off + matchLen)
-                  (if matchLen > 0
-                     then []
-                     else state : visited)
-  where
-    rules = redfaRules spec V.! state
-    stateStr = redfaNames spec V.! state
-    stateStrPretty
-      | B.null stateStr = "anonymous state"
-      | otherwise = "state `" <> toString stateStr <> "'"
--- a/src/Substr.hs
+++ b/src/Substr.hs
@ -1,7 +0,0 @@
-module Substr where
-
-import qualified Data.ByteString as B
-import Types
-
-substr :: Int -> Int -> BS -> BS
-substr b l = B.take l . B.drop b
--- a/src/Tokenizers.hs
+++ b/src/Tokenizers.hs
@ -0,0 +1,114 @@
+{-# LANGUAGE Rank2Types #-}
+{-# LANGUAGE TupleSections #-}
+
+module Tokenizers (TokOpts, tokOptions, tokenize) where
+
+import qualified Data.ByteString as B
+import qualified Data.ByteString.UTF8 as BU8
+import qualified Data.Word8 as W8
+import Options.Applicative
+import Types
+import qualified Unicode.Char.General as UC
+
+data TokOpts =
+  TokOpts
+    { optTokenizerName :: String
+    }
+  deriving (Show)
+
+tokOptions :: Parser TokOpts
+tokOptions =
+  TokOpts <$>
+  strOption
+    (metavar "LEXER" <>
+     short 'l' <>
+     long "lexer" <>
+     value "text" <>
+     help "Lexer name (use --help-lexers to list available ones)")
+
+--TODO this should later allow choosing the lexer by suffix
+tokenize :: MonadFail m => TokOpts -> FilePath -> BS -> m [Tok]
+tokenize topt _ =
+  case filter ((== optTokenizerName topt) . tkName) tokenizers of
+    [t] -> runTokenizer t
+    _ -> const $ fail "Lexer not found"
+
+data Tokenizer =
+  Tokenizer
+    { tkName :: String
+    , tkDescription :: String
+    , runTokenizer :: forall m. MonadFail m =>
+                                  BS -> m [Tok]
+    }
+
+tokenizers :: [Tokenizer]
+tokenizers =
+  [ Tokenizer "lines" "works like the traditional diff" tokenizeLines
+  , Tokenizer "asciiwords" "separate by ASCII whitespace" tokenizeWords8
+  , Tokenizer "words" "separate on any UTF8 spacing" tokenizeWords
+  , Tokenizer
+      "text"
+      "separate groups of similar-class UTF8 characters"
+      tokenizeUnicode
+  ]
+
+tokenizeLines :: MonadFail m => BS -> m [Tok]
+tokenizeLines = pure . map (True, ) . BU8.lines'
+
+tokenizeWords8 :: MonadFail m => BS -> m [Tok]
+tokenizeWords8 =
+  pure . map (\x -> (not $ W8.isSpace $ B.head x, x)) . bsGroupOn W8.isSpace
+  where
+    bsGroupOn f = B.groupBy (\l r -> f l == f r)
+
+tokenizeWords :: MonadFail m => BS -> m [Tok]
+tokenizeWords = pure . map makeTokenBU8 . groupOnBU8 UC.isWhiteSpace
+
+makeTokenBU8 :: BS -> Tok
+makeTokenBU8 s = if
+  maybe False (not . UC.isWhiteSpace . fst) (BU8.decode s)
+  then (True, s)
+  else (False, s)
+
+{- needed?
+foldri :: (Char -> Int -> a -> a) -> a -> BS -> a
+foldri cons nil cs =
+  case decode cs of
+    Just (a, l) -> cons a l (foldrS cons nil $ drop l cs)
+    Nothing -> nil
+-}
+
+spanBU8 :: (Char -> Bool) -> BS -> (BS, BS)
+spanBU8 f orig = B.splitAt (go 0 orig) orig
+  where
+    go :: Int -> BS -> Int
+    go n bs =
+      case BU8.decode bs of
+        Just (a, l) ->
+          if f a
+            then go (n + l) (B.drop l bs)
+            else n
+        Nothing -> n
+
+groupByBU8 :: (Char -> Char -> Bool) -> BS -> [BS]
+groupByBU8 f s =
+  case BU8.decode s of
+    Just (a, _) ->
+      let (g, s') = spanBU8 (f a) s
+       in g : groupByBU8 f s'
+    Nothing -> []
+
+groupOnBU8 :: Eq a => (Char -> a) -> BS -> [BS]
+groupOnBU8 f = groupByBU8 (\l r -> f l == f r)
+
+tokenizeUnicode :: MonadFail m => BS -> m [Tok]
+tokenizeUnicode = pure . map makeTokenBU8 . groupOnBU8 simpleUnicodeCategory
+
+simpleUnicodeCategory :: Char -> Int
+simpleUnicodeCategory c
+  | UC.isWhiteSpace c = 1
+  | UC.isAlphaNum c = 2
+  | UC.isSymbol c = 3
+  | UC.isPunctuation c = 4
+  | UC.isSeparator c = 5
+  | otherwise = 0
--- a/src/Types.hs
+++ b/src/Types.hs
@ -1,3 +1,5 @@
+--{-# LANGUAGE BangPatterns #-}
+
 module Types where

 import Data.ByteString