a bit of a revamp
This commit is contained in:
parent
441e2b4351
commit
6e2ab88961
26
README.md
26
README.md
|
@ -12,10 +12,14 @@ standard unix `diff` and related tools.
|
||||||
The problem was previously tackled by Arek Antoniewicz on MFF CUNI, who
|
The problem was previously tackled by Arek Antoniewicz on MFF CUNI, who
|
||||||
produced a working software package in C++, and designed the Regex-edged DFAs
|
produced a working software package in C++, and designed the Regex-edged DFAs
|
||||||
(REDFAs) that were used for user-specifiable tokenization of the input. The
|
(REDFAs) that were used for user-specifiable tokenization of the input. The
|
||||||
work on the corresponding thesis is currently ongoing.
|
work on the corresponding thesis is finished.
|
||||||
|
|
||||||
This started as a simple Haskell port of that work, and packed some relatively
|
This started as a simple Haskell port of that work, and packed some relatively
|
||||||
orthogonal improvements (mainly the histogram-style diffing).
|
orthogonal improvements (mainly the histogram-style diffing). I later got rid
|
||||||
|
of the REDFA concept -- while super-interesting and useful in theory, I didn't
|
||||||
|
find a sufficiently universal way to build good lexers from user-specified
|
||||||
|
strings. Having a proper Regex representation library (so that e.g.
|
||||||
|
reconstructing Flex is easy) would help a lot.
|
||||||
|
|
||||||
### TODO list
|
### TODO list
|
||||||
|
|
||||||
|
@ -23,16 +27,12 @@ orthogonal improvements (mainly the histogram-style diffing).
|
||||||
of hunk context. `diff` and `diff3` works.
|
of hunk context. `diff` and `diff3` works.
|
||||||
- Implement a splitting heuristic for diffs, so that diffing of large files
|
- Implement a splitting heuristic for diffs, so that diffing of large files
|
||||||
doesn't take aeons
|
doesn't take aeons
|
||||||
- Check whether REDFA can even be implemented correctly with current Haskell
|
- check if we can have external lexers, unix-style
|
||||||
libraries (most regex libraries target a completely different). Taking the
|
|
||||||
lexer specification format from `alex` currently seems like a much better
|
|
||||||
option. Deferring the task unix-ishly to another program could work too.
|
|
||||||
|
|
||||||
# How-To
|
# How-To
|
||||||
|
|
||||||
Install using `cabal`. The `adiff` program has 3 sub-commands that work like
|
Install using `cabal`. The `adiff` program has 3 sub-commands that work like
|
||||||
`diff`, `patch` and `diff3`. It expects a lexing specification on the input;
|
`diff`, `patch` and `diff3`.
|
||||||
there are several very simple example lexers in `lexers/`.
|
|
||||||
|
|
||||||
## Example
|
## Example
|
||||||
|
|
||||||
|
@ -49,15 +49,11 @@ Patching is hard. I still cannot rhyme.
|
||||||
```
|
```
|
||||||
|
|
||||||
Let's use the `words` lexer, which marks everything whitespace-ish as
|
Let's use the `words` lexer, which marks everything whitespace-ish as
|
||||||
whitespace, and picks up groups of non-whitespace "content" characters:
|
whitespace, and picks up groups of non-whitespace "content" characters.
|
||||||
```
|
|
||||||
:[^ \t\n]*
|
|
||||||
_:[ \t\n]*
|
|
||||||
```
|
|
||||||
|
|
||||||
Diffing the 2 files gets done as such:
|
Diffing the 2 files gets done as such:
|
||||||
```
|
```
|
||||||
$ cabal run adiff -- -l lexers/words diff orig mine
|
$ cabal run adiff -- -l words diff orig mine
|
||||||
```
|
```
|
||||||
|
|
||||||
You should get something like this:
|
You should get something like this:
|
||||||
|
@ -99,7 +95,7 @@ I cannot do verses.
|
||||||
We can run `diff3` to get a patch with both changes, optionally with reduced
|
We can run `diff3` to get a patch with both changes, optionally with reduced
|
||||||
context:
|
context:
|
||||||
```
|
```
|
||||||
$ cabal run adiff -- -l lexers/words diff3 mine orig yours -C1
|
$ cabal run adiff -- -l words diff3 mine orig yours -C1
|
||||||
```
|
```
|
||||||
...which outputs:
|
...which outputs:
|
||||||
```
|
```
|
||||||
|
|
11
adiff.cabal
11
adiff.cabal
|
@ -51,8 +51,7 @@ executable adiff
|
||||||
Hunks,
|
Hunks,
|
||||||
Merge,
|
Merge,
|
||||||
Patch,
|
Patch,
|
||||||
Redfa,
|
Tokenizers,
|
||||||
Substr,
|
|
||||||
Types,
|
Types,
|
||||||
Version
|
Version
|
||||||
|
|
||||||
|
@ -60,13 +59,15 @@ executable adiff
|
||||||
other-extensions: CPP
|
other-extensions: CPP
|
||||||
|
|
||||||
-- Other library packages from which modules are imported.
|
-- Other library packages from which modules are imported.
|
||||||
build-depends: base ^>=4.13.0.0,
|
build-depends: base ^>=4.15.0.0,
|
||||||
|
attoparsec ^>=0.14,
|
||||||
extra ^>= 1.7,
|
extra ^>= 1.7,
|
||||||
mmap ^>=0.5,
|
mmap ^>=0.5,
|
||||||
regex-tdfa ^>= 1.3,
|
|
||||||
optparse-applicative ^>=0.16,
|
optparse-applicative ^>=0.16,
|
||||||
bytestring ^>= 0.10.12,
|
bytestring ^>= 0.11.2,
|
||||||
vector ^>=0.12,
|
vector ^>=0.12,
|
||||||
|
word8 ^>=0.1,
|
||||||
|
unicode-data ^>=0.3,
|
||||||
utf8-string ^>=1.0
|
utf8-string ^>=1.0
|
||||||
|
|
||||||
-- Directories containing source files.
|
-- Directories containing source files.
|
||||||
|
|
|
@ -1,2 +0,0 @@
|
||||||
[a-z]
|
|
||||||
_:[ \n]
|
|
|
@ -1,2 +0,0 @@
|
||||||
[^\n]*\n
|
|
||||||
[^\n]*
|
|
|
@ -1,2 +0,0 @@
|
||||||
[0-9]
|
|
||||||
_:\n
|
|
|
@ -1,2 +0,0 @@
|
||||||
:[^ \t\n]*
|
|
||||||
_:[ \t\n]*
|
|
28
src/Diff.hs
28
src/Diff.hs
|
@ -20,9 +20,9 @@ data DiffEnv =
|
||||||
, deB :: Int
|
, deB :: Int
|
||||||
, deVS :: V.Vector (Int, Int)
|
, deVS :: V.Vector (Int, Int)
|
||||||
, deVE :: V.Vector (Int, Int)
|
, deVE :: V.Vector (Int, Int)
|
||||||
|
, deTokPrio :: Tok -> Int
|
||||||
, deTrans :: Bool
|
, deTrans :: Bool
|
||||||
}
|
}
|
||||||
deriving (Show)
|
|
||||||
|
|
||||||
toksMatch :: Int -> Int -> DiffEnv -> Bool
|
toksMatch :: Int -> Int -> DiffEnv -> Bool
|
||||||
toksMatch x y DiffEnv {deT1 = t1, deT2 = t2} = t1 V.! x == t2 V.! y
|
toksMatch x y DiffEnv {deT1 = t1, deT2 = t2} = t1 V.! x == t2 V.! y
|
||||||
|
@ -47,10 +47,19 @@ stripEqToks t1 t2 = (pre, post, t1', t2')
|
||||||
t1' = V.slice b (l1 - e - b) t1
|
t1' = V.slice b (l1 - e - b) t1
|
||||||
t2' = V.slice b (l2 - e - b) t2
|
t2' = V.slice b (l2 - e - b) t2
|
||||||
|
|
||||||
|
makePrios :: TV -> TV -> (Bool, BS) -> Int
|
||||||
|
makePrios _ _ = get
|
||||||
|
where
|
||||||
|
get (isToken, str) =
|
||||||
|
if isToken
|
||||||
|
then B.length str
|
||||||
|
else 0
|
||||||
|
|
||||||
diffToks :: TV -> TV -> Diff
|
diffToks :: TV -> TV -> Diff
|
||||||
diffToks t1' t2' = pre ++ res ++ post
|
diffToks t1' t2' = pre ++ res ++ post
|
||||||
where
|
where
|
||||||
(pre, post, t1, t2) = stripEqToks t1' t2'
|
(pre, post, t1, t2) = stripEqToks t1' t2'
|
||||||
|
stats = makePrios t1' t2'
|
||||||
res
|
res
|
||||||
| V.null t1 = map (Add, ) (V.toList t2)
|
| V.null t1 = map (Add, ) (V.toList t2)
|
||||||
| V.null t2 = map (Remove, ) (V.toList t1)
|
| V.null t2 = map (Remove, ) (V.toList t1)
|
||||||
|
@ -67,6 +76,7 @@ diffToks t1' t2' = pre ++ res ++ post
|
||||||
, deB = V.length t2
|
, deB = V.length t2
|
||||||
, deVS = V.fromList $ zip [0 .. V.length t2] (repeat 0)
|
, deVS = V.fromList $ zip [0 .. V.length t2] (repeat 0)
|
||||||
, deVE = V.fromList $ reverse $ zip [0 .. V.length t2] (repeat 0)
|
, deVE = V.fromList $ reverse $ zip [0 .. V.length t2] (repeat 0)
|
||||||
|
, deTokPrio = stats
|
||||||
, deTrans = False
|
, deTrans = False
|
||||||
}
|
}
|
||||||
| otherwise =
|
| otherwise =
|
||||||
|
@ -82,6 +92,7 @@ diffToks t1' t2' = pre ++ res ++ post
|
||||||
, deB = V.length t1
|
, deB = V.length t1
|
||||||
, deVS = V.fromList $ zip [0 .. V.length t1] (repeat 0)
|
, deVS = V.fromList $ zip [0 .. V.length t1] (repeat 0)
|
||||||
, deVE = V.fromList $ reverse $ zip [0 .. V.length t1] (repeat 0)
|
, deVE = V.fromList $ reverse $ zip [0 .. V.length t1] (repeat 0)
|
||||||
|
, deTokPrio = stats
|
||||||
, deTrans = True
|
, deTrans = True
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -110,12 +121,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
||||||
mid = quot (s + e) 2
|
mid = quot (s + e) 2
|
||||||
vecSmid = vecS mid
|
vecSmid = vecS mid
|
||||||
vecEmid = vecE mid
|
vecEmid = vecE mid
|
||||||
extraScore i =
|
prio i = negate . deTokPrio de $ deT1 de V.! i
|
||||||
if isToken
|
|
||||||
then -(B.length str)
|
|
||||||
else 0
|
|
||||||
where
|
|
||||||
(isToken, str) = deT1 de V.! i
|
|
||||||
vecS = vec -- "forward" operation
|
vecS = vec -- "forward" operation
|
||||||
where
|
where
|
||||||
vec i
|
vec i
|
||||||
|
@ -131,7 +137,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
||||||
(iupleft, supleft) = v V.! pred j
|
(iupleft, supleft) = v V.! pred j
|
||||||
keep
|
keep
|
||||||
| toksMatch (pred i) (pred j) de =
|
| toksMatch (pred i) (pred j) de =
|
||||||
min (iupleft, supleft + extraScore (pred i))
|
min (iupleft, supleft + prio (pred i))
|
||||||
| otherwise = id
|
| otherwise = id
|
||||||
res = keep $ min (succ iup, sup) (succ ileft, sleft)
|
res = keep $ min (succ iup, sup) (succ ileft, sleft)
|
||||||
in res : go (succ j) res
|
in res : go (succ j) res
|
||||||
|
@ -150,7 +156,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
||||||
(idownright, sdownright) = v V.! succ j
|
(idownright, sdownright) = v V.! succ j
|
||||||
keep
|
keep
|
||||||
| toksMatch i j de =
|
| toksMatch i j de =
|
||||||
min (idownright, sdownright + extraScore i)
|
min (idownright, sdownright + prio i)
|
||||||
| otherwise = id
|
| otherwise = id
|
||||||
res = keep $ min (succ idown, sdown) (succ iright, sright)
|
res = keep $ min (succ idown, sdown) (succ iright, sright)
|
||||||
in res : go (pred j) res
|
in res : go (pred j) res
|
||||||
|
@ -213,7 +219,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
||||||
, fst (vecLS V.! i) == fst sCost - a + i
|
, fst (vecLS V.! i) == fst sCost - a + i
|
||||||
, sumL V.! i == totalCost
|
, sumL V.! i == totalCost
|
||||||
, if doKeep
|
, if doKeep
|
||||||
then scoreAdd (vecLS V.! i) (0, extraScore s) ==
|
then scoreAdd (vecLS V.! i) (0, prio s) ==
|
||||||
vecRS V.! succ i
|
vecRS V.! succ i
|
||||||
else scoreAdd (vecLS V.! i) (1, 0) == vecRS V.! i
|
else scoreAdd (vecLS V.! i) (1, 0) == vecRS V.! i
|
||||||
, if doKeep
|
, if doKeep
|
||||||
|
@ -223,7 +229,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
|
||||||
]
|
]
|
||||||
jumpEnd =
|
jumpEnd =
|
||||||
if doKeep
|
if doKeep
|
||||||
then jumpPos + 1
|
then succ jumpPos
|
||||||
else jumpPos
|
else jumpPos
|
||||||
in map
|
in map
|
||||||
(\i ->
|
(\i ->
|
||||||
|
|
|
@ -1,30 +1,39 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
|
||||||
|
|
||||||
module Format
|
module Format
|
||||||
( pprHunks
|
( pprHunks
|
||||||
, pprHunk
|
, pprHunk
|
||||||
, pprDiff1
|
, parsePatch
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Types
|
import Types
|
||||||
|
|
||||||
|
import Control.Applicative
|
||||||
|
import Control.Monad
|
||||||
|
import Data.Attoparsec.ByteString as A
|
||||||
import qualified Data.ByteString as B
|
import qualified Data.ByteString as B
|
||||||
import qualified Data.ByteString.Builder as BB
|
import qualified Data.ByteString.Builder as BB
|
||||||
import qualified Data.ByteString.Internal as BI
|
import qualified Data.ByteString.Internal as BI
|
||||||
|
import qualified Data.ByteString.Lazy as BL
|
||||||
import Data.String
|
import Data.String
|
||||||
|
import Data.Word8 as W8
|
||||||
|
|
||||||
pprHunkHdr :: Int -> Int -> BB.Builder
|
backslash :: Word8
|
||||||
pprHunkHdr i j = fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@"
|
backslash = BI.c2w '\\'
|
||||||
|
|
||||||
lineSep :: BB.Builder
|
newline :: Word8
|
||||||
lineSep = fromString "\n"
|
newline = BI.c2w '\n'
|
||||||
|
|
||||||
pprHunks :: [Hunk] -> BB.Builder
|
pprHunks :: [Hunk] -> BB.Builder
|
||||||
pprHunks = mconcat . map pprHunk
|
pprHunks = mconcat . map pprHunk
|
||||||
|
|
||||||
|
lineSep :: BB.Builder
|
||||||
|
lineSep = BB.word8 newline
|
||||||
|
|
||||||
|
pprHunkHdr :: Int -> Int -> BB.Builder
|
||||||
|
pprHunkHdr i j =
|
||||||
|
(fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@") <> lineSep
|
||||||
|
|
||||||
pprHunk :: Hunk -> BB.Builder
|
pprHunk :: Hunk -> BB.Builder
|
||||||
pprHunk ((i, j), toks) =
|
pprHunk ((i, j), toks) = mconcat (pprHunkHdr i j : map pprDiff1 toks)
|
||||||
mconcat ((pprHunkHdr i j <> lineSep) : map pprDiff1 toks)
|
|
||||||
|
|
||||||
pprDiff1 :: (Op, Tok) -> BB.Builder
|
pprDiff1 :: (Op, Tok) -> BB.Builder
|
||||||
pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
|
pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
|
||||||
|
@ -46,6 +55,74 @@ pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
|
||||||
escNewlines :: BS -> BB.Builder
|
escNewlines :: BS -> BB.Builder
|
||||||
escNewlines s
|
escNewlines s
|
||||||
| B.null s = mempty
|
| B.null s = mempty
|
||||||
| B.head s == BI.c2w '\n' = BB.stringUtf8 "\\n" <> escNewlines (B.tail s)
|
| B.head s == newline =
|
||||||
| B.head s == BI.c2w '\\' = BB.stringUtf8 "\\\\" <> escNewlines (B.tail s)
|
BB.word8 backslash <> BB.word8 (BI.c2w 'n') <> escNewlines (B.tail s)
|
||||||
|
| B.head s == backslash =
|
||||||
|
BB.word8 backslash <> BB.word8 backslash <> escNewlines (B.tail s)
|
||||||
| otherwise = BB.word8 (B.head s) <> escNewlines (B.tail s)
|
| otherwise = BB.word8 (B.head s) <> escNewlines (B.tail s)
|
||||||
|
|
||||||
|
--parsePatch :: BS -> Either String [Hunk]
|
||||||
|
parsePatch = parseOnly parseHunks
|
||||||
|
|
||||||
|
parseHunks :: Parser [Hunk]
|
||||||
|
parseHunks = many parseHunk <* endOfInput
|
||||||
|
|
||||||
|
parseHunk :: Parser Hunk
|
||||||
|
parseHunk = liftA2 (,) parseHunkHdr (many parseDiff1)
|
||||||
|
|
||||||
|
parseInt :: Parser Int
|
||||||
|
parseInt = read . map BI.w2c <$> some (satisfy W8.isDigit)
|
||||||
|
|
||||||
|
eol :: Parser ()
|
||||||
|
eol = void $ word8 newline
|
||||||
|
|
||||||
|
parseHunkHdr :: Parser (Int, Int)
|
||||||
|
parseHunkHdr = do
|
||||||
|
void . string $ fromString "@@ -"
|
||||||
|
i <- parseInt
|
||||||
|
void . string $ fromString " +"
|
||||||
|
j <- parseInt
|
||||||
|
void . string $ fromString " @@"
|
||||||
|
eol
|
||||||
|
return (i, j)
|
||||||
|
|
||||||
|
pairs2parsers :: [(a, Char)] -> [Parser a]
|
||||||
|
pairs2parsers = map (\(x, ch) -> x <$ word8 (BI.c2w ch))
|
||||||
|
|
||||||
|
parseOpList :: [Parser Op]
|
||||||
|
parseOpList =
|
||||||
|
pairs2parsers
|
||||||
|
[ (Add, '+')
|
||||||
|
, (Keep, ' ')
|
||||||
|
, (Remove, '-')
|
||||||
|
, (MineChanged, '<')
|
||||||
|
, (Original, '=')
|
||||||
|
, (YourChanged, '>')
|
||||||
|
]
|
||||||
|
|
||||||
|
parseOp :: Parser Op
|
||||||
|
parseOp = choice parseOpList
|
||||||
|
|
||||||
|
parseTokMarkList :: [Parser Bool]
|
||||||
|
parseTokMarkList = pairs2parsers [(True, '|'), (False, '.')]
|
||||||
|
|
||||||
|
parseTokMark :: Parser Bool
|
||||||
|
parseTokMark = choice parseTokMarkList
|
||||||
|
|
||||||
|
parseTokBS :: Parser BS
|
||||||
|
parseTokBS =
|
||||||
|
(BL.toStrict . BB.toLazyByteString . mconcat <$> many parseTokChar) <* eol
|
||||||
|
|
||||||
|
parseTokChar :: Parser BB.Builder
|
||||||
|
parseTokChar =
|
||||||
|
choice
|
||||||
|
[ BB.word8 newline <$ string (fromString "\\n")
|
||||||
|
, BB.word8 backslash <$ string (fromString "\\\\")
|
||||||
|
, BB.word8 <$> satisfy (\w -> w /= backslash && w /= newline)
|
||||||
|
]
|
||||||
|
|
||||||
|
parseTok :: Parser Tok
|
||||||
|
parseTok = liftA2 (,) parseTokMark parseTokBS
|
||||||
|
|
||||||
|
parseDiff1 :: Parser (Op, Tok)
|
||||||
|
parseDiff1 = liftA2 (,) parseOp parseTok
|
||||||
|
|
200
src/Main.hs
200
src/Main.hs
|
@ -1,6 +1,10 @@
|
||||||
module Main where
|
module Main where
|
||||||
|
|
||||||
|
import Control.Exception
|
||||||
|
import Control.Monad
|
||||||
|
import qualified Data.ByteString as B
|
||||||
import qualified Data.ByteString.Builder as BB
|
import qualified Data.ByteString.Builder as BB
|
||||||
|
import Data.Foldable (traverse_)
|
||||||
import qualified Data.Vector as V
|
import qualified Data.Vector as V
|
||||||
import Diff
|
import Diff
|
||||||
import Diff3
|
import Diff3
|
||||||
|
@ -8,15 +12,17 @@ import Format
|
||||||
import Hunks
|
import Hunks
|
||||||
import Merge
|
import Merge
|
||||||
import Options.Applicative
|
import Options.Applicative
|
||||||
import Redfa
|
import Patch
|
||||||
import System.IO (stdout)
|
import System.Exit
|
||||||
|
import System.IO (hPutStrLn, stderr, stdout)
|
||||||
import System.IO.MMap
|
import System.IO.MMap
|
||||||
|
import Tokenizers
|
||||||
import Types
|
import Types
|
||||||
import Version
|
import Version
|
||||||
|
|
||||||
data ADiffOptions =
|
data ADiffOptions =
|
||||||
ADiffOptions
|
ADiffOptions
|
||||||
{ adiffRedfaOpt :: RedfaOption
|
{ adiffTokOpts :: TokOpts
|
||||||
, adiffCmdOpts :: ADiffCommandOpts
|
, adiffCmdOpts :: ADiffCommandOpts
|
||||||
}
|
}
|
||||||
deriving (Show)
|
deriving (Show)
|
||||||
|
@ -29,11 +35,16 @@ data ADiffCommandOpts
|
||||||
}
|
}
|
||||||
| CmdPatch
|
| CmdPatch
|
||||||
{ patchDryRun :: Bool
|
{ patchDryRun :: Bool
|
||||||
, patchInDir :: Maybe String
|
--, patchInDir :: Maybe String
|
||||||
, patchInput :: String
|
--, patchPathStrip :: Int
|
||||||
|
, patchInputPatch :: String
|
||||||
|
, patchOutput :: String
|
||||||
|
, patchReject :: String --todo convert to Maybes with optional
|
||||||
|
, patchBackup :: String
|
||||||
, patchReverse :: Bool
|
, patchReverse :: Bool
|
||||||
, patchPathStrip :: Int
|
, context :: Int
|
||||||
, patchMergeOpts :: MergeOpts
|
, patchMergeOpts :: MergeOpts
|
||||||
|
, patchInput :: String
|
||||||
}
|
}
|
||||||
| CmdDiff3
|
| CmdDiff3
|
||||||
{ context :: Int
|
{ context :: Int
|
||||||
|
@ -44,15 +55,22 @@ data ADiffCommandOpts
|
||||||
}
|
}
|
||||||
deriving (Show)
|
deriving (Show)
|
||||||
|
|
||||||
contextOpt :: Parser Int
|
contextOpt :: Bool -> Parser Int
|
||||||
contextOpt =
|
contextOpt forPatch =
|
||||||
check <$>
|
check <$>
|
||||||
option
|
option
|
||||||
auto
|
auto
|
||||||
(metavar "CONTEXT" <>
|
(metavar "CONTEXT" <>
|
||||||
short 'C' <>
|
short 'C' <>
|
||||||
long "context" <>
|
long "context" <>
|
||||||
value 5 <> help "How many tokens around the change to include in the patch")
|
value
|
||||||
|
(if forPatch
|
||||||
|
then 4
|
||||||
|
else 8) <>
|
||||||
|
help
|
||||||
|
(if forPatch
|
||||||
|
then "Minimum amount of context tokens that must match so that the hunk is applied"
|
||||||
|
else "How many tokens around the change to include in the patch"))
|
||||||
where
|
where
|
||||||
check c
|
check c
|
||||||
| c < 0 = error "Negative context"
|
| c < 0 = error "Negative context"
|
||||||
|
@ -60,7 +78,7 @@ contextOpt =
|
||||||
|
|
||||||
diffCmdOptions :: Parser ADiffCommandOpts
|
diffCmdOptions :: Parser ADiffCommandOpts
|
||||||
diffCmdOptions =
|
diffCmdOptions =
|
||||||
CmdDiff <$> contextOpt <*> strArgument (metavar "FROMFILE") <*>
|
CmdDiff <$> contextOpt False <*> strArgument (metavar "FROMFILE") <*>
|
||||||
strArgument (metavar "TOFILE")
|
strArgument (metavar "TOFILE")
|
||||||
|
|
||||||
patchCmdOptions :: Parser ADiffCommandOpts
|
patchCmdOptions :: Parser ADiffCommandOpts
|
||||||
|
@ -70,33 +88,49 @@ patchCmdOptions =
|
||||||
(short 'n' <>
|
(short 'n' <>
|
||||||
long "dry-run" <>
|
long "dry-run" <>
|
||||||
help "Do not patch anything, just print what would happen") <*>
|
help "Do not patch anything, just print what would happen") <*>
|
||||||
optional
|
-- optional (strOption $ short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
|
||||||
(strOption $
|
--option auto (short 'p' <> long "strip" <> metavar "NUM" <> help "Strip NUM leading components from the paths" <> value 0) <*>
|
||||||
short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
|
|
||||||
strOption
|
strOption
|
||||||
(short 'i' <>
|
(short 'i' <>
|
||||||
long "input" <>
|
long "input" <>
|
||||||
metavar "INPUT" <>
|
metavar "PATCHFILE" <>
|
||||||
help "Read the patchfile from INPUT, defaults to `-' for STDIN" <>
|
help "Read the patchfile from PATCHFILE, defaults to `-' for STDIN" <>
|
||||||
value "-") <*>
|
value "-") <*>
|
||||||
|
strOption
|
||||||
|
(short 'o' <>
|
||||||
|
long "output" <>
|
||||||
|
metavar "OUTPUT" <>
|
||||||
|
help
|
||||||
|
"Write the patched file to OUTPUT, use `-' for STDOUT. By default, INPUT is rewritten." <>
|
||||||
|
value "") <*>
|
||||||
|
strOption
|
||||||
|
(short 'r' <>
|
||||||
|
long "reject" <>
|
||||||
|
metavar "REJECTS" <>
|
||||||
|
help
|
||||||
|
"Write the rejected hunks file to file REJECTS, instead of default `OUTPUT.rej'. Use `-' to discard rejects." <>
|
||||||
|
value "") <*>
|
||||||
|
strOption
|
||||||
|
(short 'b' <>
|
||||||
|
long "backup" <>
|
||||||
|
metavar "BACKUP" <>
|
||||||
|
help
|
||||||
|
"When rewriting INPUT after a partially applied or otherwise suspicious patch, back up the original file in BACKUP instead of default `INPUT.orig'. Use `-' to discard backups." <>
|
||||||
|
value "") <*>
|
||||||
switch (short 'R' <> long "reverse" <> help "Unapply applied patches") <*>
|
switch (short 'R' <> long "reverse" <> help "Unapply applied patches") <*>
|
||||||
option
|
contextOpt True <*>
|
||||||
auto
|
mergeOption True <*>
|
||||||
(short 'p' <>
|
strArgument (metavar "INPUT")
|
||||||
long "strip" <>
|
|
||||||
metavar "NUM" <>
|
|
||||||
help "Strip NUM leading components from the paths" <> value 0) <*>
|
|
||||||
mergeOption True
|
|
||||||
|
|
||||||
diff3CmdOptions :: Parser ADiffCommandOpts
|
diff3CmdOptions :: Parser ADiffCommandOpts
|
||||||
diff3CmdOptions =
|
diff3CmdOptions =
|
||||||
CmdDiff3 <$> contextOpt <*> strArgument (metavar "MYFILE") <*>
|
CmdDiff3 <$> contextOpt False <*> strArgument (metavar "MYFILE") <*>
|
||||||
strArgument (metavar "OLDFILE") <*>
|
strArgument (metavar "OLDFILE") <*>
|
||||||
strArgument (metavar "YOURFILE") <*>
|
strArgument (metavar "YOURFILE") <*>
|
||||||
mergeOption False
|
mergeOption False
|
||||||
|
|
||||||
actionOption :: Parser ADiffCommandOpts
|
actionOptions :: Parser ADiffCommandOpts
|
||||||
actionOption =
|
actionOptions =
|
||||||
hsubparser $
|
hsubparser $
|
||||||
mconcat
|
mconcat
|
||||||
[ command "diff" $ info diffCmdOptions $ progDesc "Compare two files"
|
[ command "diff" $ info diffCmdOptions $ progDesc "Compare two files"
|
||||||
|
@ -106,36 +140,106 @@ actionOption =
|
||||||
]
|
]
|
||||||
|
|
||||||
adiffOptions :: Parser ADiffOptions
|
adiffOptions :: Parser ADiffOptions
|
||||||
adiffOptions = ADiffOptions <$> redfaOption <*> actionOption
|
adiffOptions = ADiffOptions <$> tokOptions <*> actionOptions
|
||||||
|
|
||||||
loadToks :: RedfaSpec -> FilePath -> IO TV
|
-- TODO: load in case it's not a regular file
|
||||||
loadToks redfa f =
|
loadToks :: TokOpts -> FilePath -> IO TV
|
||||||
V.fromList <$> (mmapFileByteString f Nothing >>= redfaTokenize redfa)
|
loadToks topt f =
|
||||||
|
V.fromList <$> (mmapFileByteString f Nothing >>= tokenize topt f)
|
||||||
|
|
||||||
main :: IO ()
|
doDiff :: TokOpts -> ADiffCommandOpts -> IO ()
|
||||||
main =
|
doDiff topt (CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt}) = do
|
||||||
|
[toks1, toks2] <- traverse (loadToks topt) [f1, f2]
|
||||||
|
let output = hunks (max 0 ctxt) $ diffToks toks1 toks2
|
||||||
|
BB.hPutBuilder stdout $ pprHunks output
|
||||||
|
unless (null output) $ exitWith (ExitFailure 1)
|
||||||
|
doDiff _ _ = error "dispatch failure"
|
||||||
|
|
||||||
|
doDiff3 :: TokOpts -> ADiffCommandOpts -> IO ()
|
||||||
|
doDiff3 topt (CmdDiff3 ctxt f1 f2 f3 mo) = do
|
||||||
|
[toksMine, toksOld, toksYour] <- traverse (loadToks topt) [f1, f2, f3]
|
||||||
|
let d3 = diff3Toks mo toksMine toksOld toksYour
|
||||||
|
isConflict (MineChanged, _) = True
|
||||||
|
isConflict (YourChanged, _) = True
|
||||||
|
isConflict _ = False
|
||||||
|
hasConflict = any isConflict d3
|
||||||
|
BB.hPutBuilder stdout $
|
||||||
|
if mergeDoMerge mo
|
||||||
|
then do
|
||||||
|
fmtMerged mo d3
|
||||||
|
else pprHunks $ hunks (max 0 ctxt) d3
|
||||||
|
when hasConflict $ exitWith (ExitFailure 1)
|
||||||
|
doDiff3 _ _ = error "dispatch failure"
|
||||||
|
|
||||||
|
doPatch :: TokOpts -> ADiffCommandOpts -> IO ()
|
||||||
|
doPatch topt o = do
|
||||||
|
toksIn <- loadToks topt (patchInput o)
|
||||||
|
hs' <-
|
||||||
|
parsePatch <$>
|
||||||
|
case (patchInputPatch o) of
|
||||||
|
"-" -> B.getContents
|
||||||
|
fn -> B.readFile fn
|
||||||
|
hs <-
|
||||||
|
case hs' of
|
||||||
|
Left _ -> ioError $ userError "PATCHFILE parsing failed"
|
||||||
|
Right x -> pure x
|
||||||
|
let (toks, rej, warns) =
|
||||||
|
patchToks toksIn hs (patchReverse o) (patchMergeOpts o)
|
||||||
|
sus = not (null warns)
|
||||||
|
rewritingInput = null (patchOutput o)
|
||||||
|
outputStdout = patchOutput o == "-"
|
||||||
|
rejfile
|
||||||
|
| rewritingInput || outputStdout = patchInput o ++ ".rej"
|
||||||
|
| otherwise = patchOutput o ++ ".rej"
|
||||||
|
backupfile
|
||||||
|
| patchBackup o == "-" = ""
|
||||||
|
| patchBackup o == "" && rewritingInput = patchInput o ++ ".orig"
|
||||||
|
| otherwise = patchBackup o
|
||||||
|
traverse_ (hPutStrLn stderr . pprPatchWarn) warns
|
||||||
|
if patchDryRun o
|
||||||
|
then do
|
||||||
|
hPutStrLn stderr $
|
||||||
|
(if sus
|
||||||
|
then "OK"
|
||||||
|
else "Possibly problematic") ++
|
||||||
|
" patch with " ++ show (length rej :: Int) ++ " rejected hunks"
|
||||||
|
BB.hPutBuilder stdout (pprHunks rej)
|
||||||
|
else do
|
||||||
|
when (not $ null rej) $ BB.writeFile rejfile (pprHunks rej)
|
||||||
|
when (sus && not (null backupfile)) $
|
||||||
|
B.readFile (patchInput o) >>= B.writeFile backupfile
|
||||||
|
($ mconcat $ map (BB.byteString . snd) $ V.toList toks) $
|
||||||
|
if outputStdout
|
||||||
|
then BB.hPutBuilder stdout
|
||||||
|
else BB.writeFile
|
||||||
|
(if rewritingInput
|
||||||
|
then patchInput o
|
||||||
|
else patchOutput o)
|
||||||
|
when sus $ exitWith (ExitFailure 1)
|
||||||
|
|
||||||
|
main' :: IO ()
|
||||||
|
main' =
|
||||||
let opts :: ParserInfo ADiffOptions
|
let opts :: ParserInfo ADiffOptions
|
||||||
opts =
|
opts =
|
||||||
info
|
info
|
||||||
(adiffOptions <**> versionOption "adiff" <**> helperOption)
|
(adiffOptions <**> versionOption "adiff" <**> helperOption)
|
||||||
(fullDesc <>
|
(fullDesc <>
|
||||||
progDesc
|
progDesc
|
||||||
"Compare, patch and merge files on arbitrarily-tokenized sequences." <>
|
"Compare, patch and merge files on arbitrarily tokenized sequences." <>
|
||||||
header "adiff: arbitrary-token diff utilities")
|
header "adiff: arbitrary-token diff utilities")
|
||||||
in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <-
|
in do ADiffOptions {adiffTokOpts = topt, adiffCmdOpts = copt} <-
|
||||||
customExecParser (prefs $ helpShowGlobals <> subparserInline) opts
|
customExecParser (prefs $ helpShowGlobals <> subparserInline) opts
|
||||||
redfa <- redfaPrepareRules ropt
|
(case copt of
|
||||||
case copt of
|
CmdDiff {} -> doDiff
|
||||||
CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
|
CmdDiff3 {} -> doDiff3
|
||||||
[toks1, toks2] <- traverse (loadToks redfa) [f1, f2]
|
CmdPatch {} -> doPatch)
|
||||||
BB.hPutBuilder stdout $
|
topt
|
||||||
pprHunks $ hunks (max 0 ctxt) $ diffToks toks1 toks2
|
copt
|
||||||
CmdPatch {} -> putStrLn "not supported yet"
|
|
||||||
CmdDiff3 ctxt f1 f2 f3 mo -> do
|
main :: IO ()
|
||||||
[toksMine, toksOld, toksYour] <-
|
main =
|
||||||
traverse (loadToks redfa) [f1, f2, f3]
|
main' `catch`
|
||||||
let d3 = diff3Toks mo toksMine toksOld toksYour
|
(\e -> do
|
||||||
BB.hPutBuilder stdout $
|
let err = show (e :: IOException)
|
||||||
if mergeDoMerge mo
|
hPutStrLn stderr err
|
||||||
then fmtMerged mo d3
|
exitWith $ ExitFailure 2)
|
||||||
else pprHunks $ hunks (max 0 ctxt) d3
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ mergeOption forPatch =
|
||||||
switch
|
switch
|
||||||
(short 'a' <>
|
(short 'a' <>
|
||||||
long "add-linebreak" <>
|
long "add-linebreak" <>
|
||||||
help "Automatically add a line break after conflict markers") <*>
|
help "Automatically add a line break after conflict markers. Useful with `lines' lexer.") <*>
|
||||||
mo)
|
mo)
|
||||||
where
|
where
|
||||||
marker = fromString . replicate 7
|
marker = fromString . replicate 7
|
||||||
|
@ -44,7 +44,7 @@ mergeOption forPatch =
|
||||||
long "merge" <>
|
long "merge" <>
|
||||||
help
|
help
|
||||||
(if forPatch
|
(if forPatch
|
||||||
then "Merge using conflict markers instead of printing the rejected hunks"
|
then "Instead of printing the rejected thunks, merge using conflict markers as if the INPUT was `MYFILE' and the patch would produced `YOURFILE' from the original."
|
||||||
else "Output the merged file instead of the patch")) <*>
|
else "Output the merged file instead of the patch")) <*>
|
||||||
switch
|
switch
|
||||||
(short 'i' <>
|
(short 'i' <>
|
||||||
|
@ -69,11 +69,11 @@ mergeOption forPatch =
|
||||||
help
|
help
|
||||||
("On whitespace mismatch, output the version from " ++
|
("On whitespace mismatch, output the version from " ++
|
||||||
(if forPatch
|
(if forPatch
|
||||||
then "original file"
|
then "the original file"
|
||||||
else "MYFILE") ++
|
else "MYFILE") ++
|
||||||
" instead of the one from " ++
|
" instead of the one from " ++
|
||||||
(if forPatch
|
(if forPatch
|
||||||
then "patch"
|
then "the context in patch"
|
||||||
else "YOURFILE"))) <*>
|
else "YOURFILE"))) <*>
|
||||||
strOption
|
strOption
|
||||||
(long "merge-start" <>
|
(long "merge-start" <>
|
||||||
|
|
32
src/Patch.hs
32
src/Patch.hs
|
@ -1,4 +1,30 @@
|
||||||
module Patch where
|
module Patch (patchToks, pprPatchWarn) where
|
||||||
|
|
||||||
patchToks :: a
|
import Format
|
||||||
patchToks = undefined
|
import Types
|
||||||
|
import Merge
|
||||||
|
import qualified Data.ByteString as B
|
||||||
|
|
||||||
|
data PatchWarn = HunkMatched Hunk Int | HunkFailed Hunk
|
||||||
|
|
||||||
|
pprPatchWarn :: PatchWarn -> String
|
||||||
|
pprPatchWarn (HunkMatched ((o,n), _) at) = "hunk (-"++ show o ++ " +"++show n++") succeeded at "++show at
|
||||||
|
pprPatchWarn (HunkFailed ((o,n),_)) = "hunk (-"++ show o ++ " +"++show n++") FAILED"
|
||||||
|
|
||||||
|
patchToks :: TV -> [Hunk] -> Bool -> MergeOpts -> (TV, [Hunk], [PatchWarn])
|
||||||
|
patchToks toks hunks rev mopt = undefined
|
||||||
|
--let (tokss, hunk, sus) = patchHunksAt 0 0 0 (if rev then map revHunk hunks else hunks) mopt
|
||||||
|
|
||||||
|
|
||||||
|
patchHunksAt :: TV -> Int -> Int -> Int -> [Hunk] -> MergeOpts -> ([TV], [PatchWarn])
|
||||||
|
patchHunksAt toks tvoff origoff newoff hunks mopt = undefined
|
||||||
|
|
||||||
|
hunkToMatch :: Diff -> Diff
|
||||||
|
hunkToMatch = filter $ (`elem` [Keep, Remove, Original]) . fst
|
||||||
|
|
||||||
|
hunkToReplace :: Diff -> Diff
|
||||||
|
hunkToReplace = filter $ (`elem` [Keep, Add, MineChanged, YourChanged]) . fst
|
||||||
|
|
||||||
|
tokCmp :: Bool -> Tok -> Tok -> Bool
|
||||||
|
tokCmp True (False, _) (False, _) = True -- ignore the whitespace change
|
||||||
|
tokCmp _ a b = a == b -- otherwise just compare
|
||||||
|
|
177
src/Redfa.hs
177
src/Redfa.hs
|
@ -1,177 +0,0 @@
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
|
||||||
|
|
||||||
module Redfa where
|
|
||||||
|
|
||||||
import Control.Monad
|
|
||||||
import qualified Data.ByteString as B
|
|
||||||
import qualified Data.ByteString.Builder as BB
|
|
||||||
import qualified Data.ByteString.Char8 as B8
|
|
||||||
import qualified Data.ByteString.Internal as BI
|
|
||||||
import qualified Data.ByteString.Lazy as BL
|
|
||||||
import Data.ByteString.UTF8 (fromString, toString)
|
|
||||||
import Data.List
|
|
||||||
import Data.Maybe
|
|
||||||
import qualified Data.Vector as V
|
|
||||||
import Options.Applicative
|
|
||||||
import Substr
|
|
||||||
import Text.Regex.TDFA
|
|
||||||
import Types
|
|
||||||
|
|
||||||
data RedfaOption
|
|
||||||
= RedfaOptionRules [BS]
|
|
||||||
| RedfaOptionFile String
|
|
||||||
deriving (Show)
|
|
||||||
|
|
||||||
data RedfaSpec =
|
|
||||||
RedfaSpec
|
|
||||||
{ redfaStart :: Int
|
|
||||||
, redfaNames :: V.Vector BS
|
|
||||||
, redfaRules :: V.Vector [RedfaRule] --start, mid, jump, token
|
|
||||||
}
|
|
||||||
|
|
||||||
data RedfaRule =
|
|
||||||
RedfaRule
|
|
||||||
{ rrRegexStart :: Regex
|
|
||||||
, rrRegexMid :: Regex
|
|
||||||
, rrJump :: Int
|
|
||||||
, rrIsToken :: Bool
|
|
||||||
}
|
|
||||||
|
|
||||||
redfaOption :: Parser RedfaOption
|
|
||||||
redfaOption =
|
|
||||||
RedfaOptionRules <$>
|
|
||||||
some
|
|
||||||
(strOption $
|
|
||||||
metavar "RULE" <>
|
|
||||||
short 'L' <>
|
|
||||||
long "lex" <> help "Lexing rule (specify repeatedly for more rules)") <|>
|
|
||||||
RedfaOptionFile <$>
|
|
||||||
strOption
|
|
||||||
(metavar "FILE" <>
|
|
||||||
short 'l' <>
|
|
||||||
long "lexer" <> help "File from where to load the lexing rules")
|
|
||||||
|
|
||||||
redfaOptionToRuleStrings :: RedfaOption -> IO [BS]
|
|
||||||
redfaOptionToRuleStrings (RedfaOptionRules x) = return x
|
|
||||||
redfaOptionToRuleStrings (RedfaOptionFile fn) =
|
|
||||||
B8.lines <$> B.readFile fn -- TODO improve
|
|
||||||
|
|
||||||
splitFirst :: Char -> BS -> (BS, BS)
|
|
||||||
splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
|
|
||||||
|
|
||||||
redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
|
|
||||||
redfaRuleStringToRuleStr s =
|
|
||||||
let (spec, regex) = splitFirst ':' s
|
|
||||||
(from, to) = splitFirst '>' spec
|
|
||||||
sf = B8.strip from
|
|
||||||
(cleanFrom, isToken)
|
|
||||||
| B.null sf = (sf, True)
|
|
||||||
| B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False)
|
|
||||||
| otherwise = (sf, True)
|
|
||||||
go
|
|
||||||
| B.null s = Nothing
|
|
||||||
| B.head s == 35 = Nothing -- # comment
|
|
||||||
| B.null regex = Just (B.empty, B.empty, spec, isToken)
|
|
||||||
| B.null to = Just (cleanFrom, cleanFrom, B.tail regex, isToken)
|
|
||||||
| otherwise =
|
|
||||||
Just (cleanFrom, B8.strip $ B.tail to, B.tail regex, isToken)
|
|
||||||
in go
|
|
||||||
|
|
||||||
unescapeRegex :: MonadFail m => BS -> m BS
|
|
||||||
unescapeRegex s' = BL.toStrict . BB.toLazyByteString <$> unescape' s'
|
|
||||||
where
|
|
||||||
unescape' :: MonadFail m => BS -> m BB.Builder
|
|
||||||
unescape' s
|
|
||||||
| B.null s = return mempty
|
|
||||||
| B.head s == BI.c2w '\\' && B.null (B.tail s) =
|
|
||||||
fail "incomplete escape sequence"
|
|
||||||
| B.head s == BI.c2w '\\' =
|
|
||||||
let rest = B.tail s
|
|
||||||
cc = B.head rest
|
|
||||||
thechar =
|
|
||||||
BB.stringUtf8 $
|
|
||||||
case BI.w2c cc of
|
|
||||||
'`' -> "\\`"
|
|
||||||
'\'' -> "\\'"
|
|
||||||
'b' -> "\\b"
|
|
||||||
'B' -> "\\B"
|
|
||||||
'<' -> "\\<"
|
|
||||||
'>' -> "\\>"
|
|
||||||
'a' -> "\a"
|
|
||||||
'e' -> "\x1b"
|
|
||||||
'f' -> "\f"
|
|
||||||
'n' -> "\n"
|
|
||||||
'r' -> "\r"
|
|
||||||
't' -> "\t"
|
|
||||||
'\\' -> "\\\\"
|
|
||||||
a -> [a] --TODO add support for \x and \u
|
|
||||||
in (thechar <>) <$> unescape' (B.tail rest)
|
|
||||||
| otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s)
|
|
||||||
|
|
||||||
redfaPrepareRules :: RedfaOption -> IO RedfaSpec
|
|
||||||
redfaPrepareRules opt = do
|
|
||||||
(states, jumps, regexes, isToken) <-
|
|
||||||
unzip4 . mapMaybe redfaRuleStringToRuleStr <$> redfaOptionToRuleStrings opt
|
|
||||||
uRegexes <- traverse unescapeRegex regexes
|
|
||||||
startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
|
|
||||||
midREs <- traverse (makeRegexM . (fromString "\\`(.|\n)" <>)) uRegexes
|
|
||||||
let ids = nub . sort $ states ++ jumps
|
|
||||||
index' = fromJust . flip elemIndex ids
|
|
||||||
statesIds = map index' states
|
|
||||||
jumpsIds = map index' jumps
|
|
||||||
start = index' $ head states
|
|
||||||
rules :: [[RedfaRule]]
|
|
||||||
rules = do
|
|
||||||
stateId <- [0 .. length ids - 1]
|
|
||||||
return $ do
|
|
||||||
(a, b, (rs, rm), t) <-
|
|
||||||
zip4 statesIds jumpsIds (zip startREs midREs) isToken
|
|
||||||
guard $ a == stateId
|
|
||||||
return $ RedfaRule rs rm b t
|
|
||||||
return $ RedfaSpec start (V.fromList ids) (V.fromList rules)
|
|
||||||
|
|
||||||
redfaTokenize :: MonadFail m => RedfaSpec -> BS -> m [Tok]
|
|
||||||
redfaTokenize spec s = redfaTokenize' spec s (redfaStart spec) 0 []
|
|
||||||
|
|
||||||
redfaTokenize' ::
|
|
||||||
MonadFail m => RedfaSpec -> BS -> Int -> Int -> [Int] -> m [Tok]
|
|
||||||
redfaTokenize' spec s state off visited
|
|
||||||
| off >= B.length s = pure []
|
|
||||||
| otherwise =
|
|
||||||
let (ooff, reg) =
|
|
||||||
if off == 0
|
|
||||||
then (0, rrRegexStart)
|
|
||||||
else (1, rrRegexMid)
|
|
||||||
matchString = B.drop (off - ooff) s
|
|
||||||
matches :: [(RedfaRule, (MatchOffset, MatchLength))]
|
|
||||||
matches =
|
|
||||||
filter contOK $
|
|
||||||
zip rules $ map (\x -> match (reg x) matchString) rules
|
|
||||||
contOK (RedfaRule {rrJump = j}, (off', len))
|
|
||||||
| off' /= 0 = False
|
|
||||||
| len > ooff = True
|
|
||||||
| otherwise = j `notElem` visited
|
|
||||||
in case matches of
|
|
||||||
[] ->
|
|
||||||
fail $
|
|
||||||
"Tokenization could not continue from " <>
|
|
||||||
stateStrPretty <> " at offset " <> show (off + ooff)
|
|
||||||
((rule, (_, len)):_) ->
|
|
||||||
let matchLen = len - ooff
|
|
||||||
in (if matchLen > 0
|
|
||||||
then (:) (rrIsToken rule, substr off matchLen s)
|
|
||||||
else id) <$>
|
|
||||||
redfaTokenize'
|
|
||||||
spec
|
|
||||||
s
|
|
||||||
(rrJump rule)
|
|
||||||
(off + matchLen)
|
|
||||||
(if matchLen > 0
|
|
||||||
then []
|
|
||||||
else state : visited)
|
|
||||||
where
|
|
||||||
rules = redfaRules spec V.! state
|
|
||||||
stateStr = redfaNames spec V.! state
|
|
||||||
stateStrPretty
|
|
||||||
| B.null stateStr = "anonymous state"
|
|
||||||
| otherwise = "state `" <> toString stateStr <> "'"
|
|
|
@ -1,7 +0,0 @@
|
||||||
module Substr where
|
|
||||||
|
|
||||||
import qualified Data.ByteString as B
|
|
||||||
import Types
|
|
||||||
|
|
||||||
substr :: Int -> Int -> BS -> BS
|
|
||||||
substr b l = B.take l . B.drop b
|
|
114
src/Tokenizers.hs
Normal file
114
src/Tokenizers.hs
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
{-# LANGUAGE Rank2Types #-}
|
||||||
|
{-# LANGUAGE TupleSections #-}
|
||||||
|
|
||||||
|
module Tokenizers (TokOpts, tokOptions, tokenize) where
|
||||||
|
|
||||||
|
import qualified Data.ByteString as B
|
||||||
|
import qualified Data.ByteString.UTF8 as BU8
|
||||||
|
import qualified Data.Word8 as W8
|
||||||
|
import Options.Applicative
|
||||||
|
import Types
|
||||||
|
import qualified Unicode.Char.General as UC
|
||||||
|
|
||||||
|
data TokOpts =
|
||||||
|
TokOpts
|
||||||
|
{ optTokenizerName :: String
|
||||||
|
}
|
||||||
|
deriving (Show)
|
||||||
|
|
||||||
|
tokOptions :: Parser TokOpts
|
||||||
|
tokOptions =
|
||||||
|
TokOpts <$>
|
||||||
|
strOption
|
||||||
|
(metavar "LEXER" <>
|
||||||
|
short 'l' <>
|
||||||
|
long "lexer" <>
|
||||||
|
value "text" <>
|
||||||
|
help "Lexer name (use --help-lexers to list available ones)")
|
||||||
|
|
||||||
|
--TODO this should later allow choosing the lexer by suffix
|
||||||
|
tokenize :: MonadFail m => TokOpts -> FilePath -> BS -> m [Tok]
|
||||||
|
tokenize topt _ =
|
||||||
|
case filter ((== optTokenizerName topt) . tkName) tokenizers of
|
||||||
|
[t] -> runTokenizer t
|
||||||
|
_ -> const $ fail "Lexer not found"
|
||||||
|
|
||||||
|
data Tokenizer =
|
||||||
|
Tokenizer
|
||||||
|
{ tkName :: String
|
||||||
|
, tkDescription :: String
|
||||||
|
, runTokenizer :: forall m. MonadFail m =>
|
||||||
|
BS -> m [Tok]
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenizers :: [Tokenizer]
|
||||||
|
tokenizers =
|
||||||
|
[ Tokenizer "lines" "works like the traditional diff" tokenizeLines
|
||||||
|
, Tokenizer "asciiwords" "separate by ASCII whitespace" tokenizeWords8
|
||||||
|
, Tokenizer "words" "separate on any UTF8 spacing" tokenizeWords
|
||||||
|
, Tokenizer
|
||||||
|
"text"
|
||||||
|
"separate groups of similar-class UTF8 characters"
|
||||||
|
tokenizeUnicode
|
||||||
|
]
|
||||||
|
|
||||||
|
tokenizeLines :: MonadFail m => BS -> m [Tok]
|
||||||
|
tokenizeLines = pure . map (True, ) . BU8.lines'
|
||||||
|
|
||||||
|
tokenizeWords8 :: MonadFail m => BS -> m [Tok]
|
||||||
|
tokenizeWords8 =
|
||||||
|
pure . map (\x -> (not $ W8.isSpace $ B.head x, x)) . bsGroupOn W8.isSpace
|
||||||
|
where
|
||||||
|
bsGroupOn f = B.groupBy (\l r -> f l == f r)
|
||||||
|
|
||||||
|
tokenizeWords :: MonadFail m => BS -> m [Tok]
|
||||||
|
tokenizeWords = pure . map makeTokenBU8 . groupOnBU8 UC.isWhiteSpace
|
||||||
|
|
||||||
|
makeTokenBU8 :: BS -> Tok
|
||||||
|
makeTokenBU8 s = if
|
||||||
|
maybe False (not . UC.isWhiteSpace . fst) (BU8.decode s)
|
||||||
|
then (True, s)
|
||||||
|
else (False, s)
|
||||||
|
|
||||||
|
{- needed?
|
||||||
|
foldri :: (Char -> Int -> a -> a) -> a -> BS -> a
|
||||||
|
foldri cons nil cs =
|
||||||
|
case decode cs of
|
||||||
|
Just (a, l) -> cons a l (foldrS cons nil $ drop l cs)
|
||||||
|
Nothing -> nil
|
||||||
|
-}
|
||||||
|
|
||||||
|
spanBU8 :: (Char -> Bool) -> BS -> (BS, BS)
|
||||||
|
spanBU8 f orig = B.splitAt (go 0 orig) orig
|
||||||
|
where
|
||||||
|
go :: Int -> BS -> Int
|
||||||
|
go n bs =
|
||||||
|
case BU8.decode bs of
|
||||||
|
Just (a, l) ->
|
||||||
|
if f a
|
||||||
|
then go (n + l) (B.drop l bs)
|
||||||
|
else n
|
||||||
|
Nothing -> n
|
||||||
|
|
||||||
|
groupByBU8 :: (Char -> Char -> Bool) -> BS -> [BS]
|
||||||
|
groupByBU8 f s =
|
||||||
|
case BU8.decode s of
|
||||||
|
Just (a, _) ->
|
||||||
|
let (g, s') = spanBU8 (f a) s
|
||||||
|
in g : groupByBU8 f s'
|
||||||
|
Nothing -> []
|
||||||
|
|
||||||
|
groupOnBU8 :: Eq a => (Char -> a) -> BS -> [BS]
|
||||||
|
groupOnBU8 f = groupByBU8 (\l r -> f l == f r)
|
||||||
|
|
||||||
|
tokenizeUnicode :: MonadFail m => BS -> m [Tok]
|
||||||
|
tokenizeUnicode = pure . map makeTokenBU8 . groupOnBU8 simpleUnicodeCategory
|
||||||
|
|
||||||
|
simpleUnicodeCategory :: Char -> Int
|
||||||
|
simpleUnicodeCategory c
|
||||||
|
| UC.isWhiteSpace c = 1
|
||||||
|
| UC.isAlphaNum c = 2
|
||||||
|
| UC.isSymbol c = 3
|
||||||
|
| UC.isPunctuation c = 4
|
||||||
|
| UC.isSeparator c = 5
|
||||||
|
| otherwise = 0
|
|
@ -1,3 +1,5 @@
|
||||||
|
--{-# LANGUAGE BangPatterns #-}
|
||||||
|
|
||||||
module Types where
|
module Types where
|
||||||
|
|
||||||
import Data.ByteString
|
import Data.ByteString
|
||||||
|
|
Loading…
Reference in a new issue