a bit of a revamp

This commit is contained in:
Mirek Kratochvil 2022-05-25 08:30:09 +02:00
parent 441e2b4351
commit 6e2ab88961
15 changed files with 423 additions and 289 deletions

View file

@ -12,10 +12,14 @@ standard unix `diff` and related tools.
The problem was previously tackled by Arek Antoniewicz on MFF CUNI, who
produced a working software package in C++, and designed the Regex-edged DFAs
(REDFAs) that were used for user-specifiable tokenization of the input. The
work on the corresponding thesis is currently ongoing.
work on the corresponding thesis is finished.
This started as a simple Haskell port of that work, and packed some relatively
orthogonal improvements (mainly the histogram-style diffing).
orthogonal improvements (mainly the histogram-style diffing). I later got rid
of the REDFA concept -- while super-interesting and useful in theory, I didn't
find a sufficiently universal way to build good lexers from user-specified
strings. Having a proper Regex representation library (so that e.g.
reconstructing Flex is easy) would help a lot.
### TODO list
@ -23,16 +27,12 @@ orthogonal improvements (mainly the histogram-style diffing).
of hunk context. `diff` and `diff3` works.
- Implement a splitting heuristic for diffs, so that diffing of large files
doesn't take aeons
- Check whether REDFA can even be implemented correctly with current Haskell
libraries (most regex libraries target a completely different). Taking the
lexer specification format from `alex` currently seems like a much better
option. Deferring the task unix-ishly to another program could work too.
- check if we can have external lexers, unix-style
# How-To
Install using `cabal`. The `adiff` program has 3 sub-commands that work like
`diff`, `patch` and `diff3`. It expects a lexing specification on the input;
there are several very simple example lexers in `lexers/`.
`diff`, `patch` and `diff3`.
## Example
@ -49,15 +49,11 @@ Patching is hard. I still cannot rhyme.
```
Let's use the `words` lexer, which marks everything whitespace-ish as
whitespace, and picks up groups of non-whitespace "content" characters:
```
:[^ \t\n]*
_:[ \t\n]*
```
whitespace, and picks up groups of non-whitespace "content" characters.
Diffing the 2 files gets done as such:
```
$ cabal run adiff -- -l lexers/words diff orig mine
$ cabal run adiff -- -l words diff orig mine
```
You should get something like this:
@ -99,7 +95,7 @@ I cannot do verses.
We can run `diff3` to get a patch with both changes, optionally with reduced
context:
```
$ cabal run adiff -- -l lexers/words diff3 mine orig yours -C1
$ cabal run adiff -- -l words diff3 mine orig yours -C1
```
...which outputs:
```

View file

@ -51,8 +51,7 @@ executable adiff
Hunks,
Merge,
Patch,
Redfa,
Substr,
Tokenizers,
Types,
Version
@ -60,13 +59,15 @@ executable adiff
other-extensions: CPP
-- Other library packages from which modules are imported.
build-depends: base ^>=4.13.0.0,
build-depends: base ^>=4.15.0.0,
attoparsec ^>=0.14,
extra ^>= 1.7,
mmap ^>=0.5,
regex-tdfa ^>= 1.3,
optparse-applicative ^>=0.16,
bytestring ^>= 0.10.12,
bytestring ^>= 0.11.2,
vector ^>=0.12,
word8 ^>=0.1,
unicode-data ^>=0.3,
utf8-string ^>=1.0
-- Directories containing source files.

View file

@ -1,2 +0,0 @@
[a-z]
_:[ \n]

View file

@ -1,2 +0,0 @@
[^\n]*\n
[^\n]*

View file

@ -1,2 +0,0 @@
[0-9]
_:\n

View file

@ -1,2 +0,0 @@
:[^ \t\n]*
_:[ \t\n]*

View file

@ -20,9 +20,9 @@ data DiffEnv =
, deB :: Int
, deVS :: V.Vector (Int, Int)
, deVE :: V.Vector (Int, Int)
, deTokPrio :: Tok -> Int
, deTrans :: Bool
}
deriving (Show)
toksMatch :: Int -> Int -> DiffEnv -> Bool
toksMatch x y DiffEnv {deT1 = t1, deT2 = t2} = t1 V.! x == t2 V.! y
@ -47,10 +47,19 @@ stripEqToks t1 t2 = (pre, post, t1', t2')
t1' = V.slice b (l1 - e - b) t1
t2' = V.slice b (l2 - e - b) t2
makePrios :: TV -> TV -> (Bool, BS) -> Int
makePrios _ _ = get
where
get (isToken, str) =
if isToken
then B.length str
else 0
diffToks :: TV -> TV -> Diff
diffToks t1' t2' = pre ++ res ++ post
where
(pre, post, t1, t2) = stripEqToks t1' t2'
stats = makePrios t1' t2'
res
| V.null t1 = map (Add, ) (V.toList t2)
| V.null t2 = map (Remove, ) (V.toList t1)
@ -67,6 +76,7 @@ diffToks t1' t2' = pre ++ res ++ post
, deB = V.length t2
, deVS = V.fromList $ zip [0 .. V.length t2] (repeat 0)
, deVE = V.fromList $ reverse $ zip [0 .. V.length t2] (repeat 0)
, deTokPrio = stats
, deTrans = False
}
| otherwise =
@ -82,6 +92,7 @@ diffToks t1' t2' = pre ++ res ++ post
, deB = V.length t1
, deVS = V.fromList $ zip [0 .. V.length t1] (repeat 0)
, deVE = V.fromList $ reverse $ zip [0 .. V.length t1] (repeat 0)
, deTokPrio = stats
, deTrans = True
}
@ -110,12 +121,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
mid = quot (s + e) 2
vecSmid = vecS mid
vecEmid = vecE mid
extraScore i =
if isToken
then -(B.length str)
else 0
where
(isToken, str) = deT1 de V.! i
prio i = negate . deTokPrio de $ deT1 de V.! i
vecS = vec -- "forward" operation
where
vec i
@ -131,7 +137,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
(iupleft, supleft) = v V.! pred j
keep
| toksMatch (pred i) (pred j) de =
min (iupleft, supleft + extraScore (pred i))
min (iupleft, supleft + prio (pred i))
| otherwise = id
res = keep $ min (succ iup, sup) (succ ileft, sleft)
in res : go (succ j) res
@ -150,7 +156,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
(idownright, sdownright) = v V.! succ j
keep
| toksMatch i j de =
min (idownright, sdownright + extraScore i)
min (idownright, sdownright + prio i)
| otherwise = id
res = keep $ min (succ idown, sdown) (succ iright, sright)
in res : go (pred j) res
@ -213,7 +219,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
, fst (vecLS V.! i) == fst sCost - a + i
, sumL V.! i == totalCost
, if doKeep
then scoreAdd (vecLS V.! i) (0, extraScore s) ==
then scoreAdd (vecLS V.! i) (0, prio s) ==
vecRS V.! succ i
else scoreAdd (vecLS V.! i) (1, 0) == vecRS V.! i
, if doKeep
@ -223,7 +229,7 @@ diffToks' de@DiffEnv {deS = s, deE = e, deA = a, deB = b, deTrans = trans} =
]
jumpEnd =
if doKeep
then jumpPos + 1
then succ jumpPos
else jumpPos
in map
(\i ->

View file

@ -1,30 +1,39 @@
{-# LANGUAGE OverloadedStrings #-}
module Format
( pprHunks
, pprHunk
, pprDiff1
, parsePatch
) where
import Types
import Control.Applicative
import Control.Monad
import Data.Attoparsec.ByteString as A
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
import Data.String
import Data.Word8 as W8
pprHunkHdr :: Int -> Int -> BB.Builder
pprHunkHdr i j = fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@"
backslash :: Word8
backslash = BI.c2w '\\'
lineSep :: BB.Builder
lineSep = fromString "\n"
newline :: Word8
newline = BI.c2w '\n'
pprHunks :: [Hunk] -> BB.Builder
pprHunks = mconcat . map pprHunk
lineSep :: BB.Builder
lineSep = BB.word8 newline
pprHunkHdr :: Int -> Int -> BB.Builder
pprHunkHdr i j =
(fromString $ "@@ -" ++ show i ++ " +" ++ show j ++ " @@") <> lineSep
pprHunk :: Hunk -> BB.Builder
pprHunk ((i, j), toks) =
mconcat ((pprHunkHdr i j <> lineSep) : map pprDiff1 toks)
pprHunk ((i, j), toks) = mconcat (pprHunkHdr i j : map pprDiff1 toks)
pprDiff1 :: (Op, Tok) -> BB.Builder
pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
@ -46,6 +55,74 @@ pprDiff1 (op, (tok, s)) = fromString pfx <> escNewlines s <> lineSep
escNewlines :: BS -> BB.Builder
escNewlines s
| B.null s = mempty
| B.head s == BI.c2w '\n' = BB.stringUtf8 "\\n" <> escNewlines (B.tail s)
| B.head s == BI.c2w '\\' = BB.stringUtf8 "\\\\" <> escNewlines (B.tail s)
| B.head s == newline =
BB.word8 backslash <> BB.word8 (BI.c2w 'n') <> escNewlines (B.tail s)
| B.head s == backslash =
BB.word8 backslash <> BB.word8 backslash <> escNewlines (B.tail s)
| otherwise = BB.word8 (B.head s) <> escNewlines (B.tail s)
--parsePatch :: BS -> Either String [Hunk]
parsePatch = parseOnly parseHunks
parseHunks :: Parser [Hunk]
parseHunks = many parseHunk <* endOfInput
parseHunk :: Parser Hunk
parseHunk = liftA2 (,) parseHunkHdr (many parseDiff1)
parseInt :: Parser Int
parseInt = read . map BI.w2c <$> some (satisfy W8.isDigit)
eol :: Parser ()
eol = void $ word8 newline
parseHunkHdr :: Parser (Int, Int)
parseHunkHdr = do
void . string $ fromString "@@ -"
i <- parseInt
void . string $ fromString " +"
j <- parseInt
void . string $ fromString " @@"
eol
return (i, j)
pairs2parsers :: [(a, Char)] -> [Parser a]
pairs2parsers = map (\(x, ch) -> x <$ word8 (BI.c2w ch))
parseOpList :: [Parser Op]
parseOpList =
pairs2parsers
[ (Add, '+')
, (Keep, ' ')
, (Remove, '-')
, (MineChanged, '<')
, (Original, '=')
, (YourChanged, '>')
]
parseOp :: Parser Op
parseOp = choice parseOpList
parseTokMarkList :: [Parser Bool]
parseTokMarkList = pairs2parsers [(True, '|'), (False, '.')]
parseTokMark :: Parser Bool
parseTokMark = choice parseTokMarkList
parseTokBS :: Parser BS
parseTokBS =
(BL.toStrict . BB.toLazyByteString . mconcat <$> many parseTokChar) <* eol
parseTokChar :: Parser BB.Builder
parseTokChar =
choice
[ BB.word8 newline <$ string (fromString "\\n")
, BB.word8 backslash <$ string (fromString "\\\\")
, BB.word8 <$> satisfy (\w -> w /= backslash && w /= newline)
]
parseTok :: Parser Tok
parseTok = liftA2 (,) parseTokMark parseTokBS
parseDiff1 :: Parser (Op, Tok)
parseDiff1 = liftA2 (,) parseOp parseTok

View file

@ -1,6 +1,10 @@
module Main where
import Control.Exception
import Control.Monad
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as BB
import Data.Foldable (traverse_)
import qualified Data.Vector as V
import Diff
import Diff3
@ -8,15 +12,17 @@ import Format
import Hunks
import Merge
import Options.Applicative
import Redfa
import System.IO (stdout)
import Patch
import System.Exit
import System.IO (hPutStrLn, stderr, stdout)
import System.IO.MMap
import Tokenizers
import Types
import Version
data ADiffOptions =
ADiffOptions
{ adiffRedfaOpt :: RedfaOption
{ adiffTokOpts :: TokOpts
, adiffCmdOpts :: ADiffCommandOpts
}
deriving (Show)
@ -29,11 +35,16 @@ data ADiffCommandOpts
}
| CmdPatch
{ patchDryRun :: Bool
, patchInDir :: Maybe String
, patchInput :: String
--, patchInDir :: Maybe String
--, patchPathStrip :: Int
, patchInputPatch :: String
, patchOutput :: String
, patchReject :: String --todo convert to Maybes with optional
, patchBackup :: String
, patchReverse :: Bool
, patchPathStrip :: Int
, context :: Int
, patchMergeOpts :: MergeOpts
, patchInput :: String
}
| CmdDiff3
{ context :: Int
@ -44,15 +55,22 @@ data ADiffCommandOpts
}
deriving (Show)
contextOpt :: Parser Int
contextOpt =
contextOpt :: Bool -> Parser Int
contextOpt forPatch =
check <$>
option
auto
(metavar "CONTEXT" <>
short 'C' <>
long "context" <>
value 5 <> help "How many tokens around the change to include in the patch")
value
(if forPatch
then 4
else 8) <>
help
(if forPatch
then "Minimum amount of context tokens that must match so that the hunk is applied"
else "How many tokens around the change to include in the patch"))
where
check c
| c < 0 = error "Negative context"
@ -60,7 +78,7 @@ contextOpt =
diffCmdOptions :: Parser ADiffCommandOpts
diffCmdOptions =
CmdDiff <$> contextOpt <*> strArgument (metavar "FROMFILE") <*>
CmdDiff <$> contextOpt False <*> strArgument (metavar "FROMFILE") <*>
strArgument (metavar "TOFILE")
patchCmdOptions :: Parser ADiffCommandOpts
@ -70,33 +88,49 @@ patchCmdOptions =
(short 'n' <>
long "dry-run" <>
help "Do not patch anything, just print what would happen") <*>
optional
(strOption $
short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
-- optional (strOption $ short 'd' <> long "directory" <> metavar "DIR" <> help "Work in DIR") <*>
--option auto (short 'p' <> long "strip" <> metavar "NUM" <> help "Strip NUM leading components from the paths" <> value 0) <*>
strOption
(short 'i' <>
long "input" <>
metavar "INPUT" <>
help "Read the patchfile from INPUT, defaults to `-' for STDIN" <>
metavar "PATCHFILE" <>
help "Read the patchfile from PATCHFILE, defaults to `-' for STDIN" <>
value "-") <*>
strOption
(short 'o' <>
long "output" <>
metavar "OUTPUT" <>
help
"Write the patched file to OUTPUT, use `-' for STDOUT. By default, INPUT is rewritten." <>
value "") <*>
strOption
(short 'r' <>
long "reject" <>
metavar "REJECTS" <>
help
"Write the rejected hunks file to file REJECTS, instead of default `OUTPUT.rej'. Use `-' to discard rejects." <>
value "") <*>
strOption
(short 'b' <>
long "backup" <>
metavar "BACKUP" <>
help
"When rewriting INPUT after a partially applied or otherwise suspicious patch, back up the original file in BACKUP instead of default `INPUT.orig'. Use `-' to discard backups." <>
value "") <*>
switch (short 'R' <> long "reverse" <> help "Unapply applied patches") <*>
option
auto
(short 'p' <>
long "strip" <>
metavar "NUM" <>
help "Strip NUM leading components from the paths" <> value 0) <*>
mergeOption True
contextOpt True <*>
mergeOption True <*>
strArgument (metavar "INPUT")
diff3CmdOptions :: Parser ADiffCommandOpts
diff3CmdOptions =
CmdDiff3 <$> contextOpt <*> strArgument (metavar "MYFILE") <*>
CmdDiff3 <$> contextOpt False <*> strArgument (metavar "MYFILE") <*>
strArgument (metavar "OLDFILE") <*>
strArgument (metavar "YOURFILE") <*>
mergeOption False
actionOption :: Parser ADiffCommandOpts
actionOption =
actionOptions :: Parser ADiffCommandOpts
actionOptions =
hsubparser $
mconcat
[ command "diff" $ info diffCmdOptions $ progDesc "Compare two files"
@ -106,36 +140,106 @@ actionOption =
]
adiffOptions :: Parser ADiffOptions
adiffOptions = ADiffOptions <$> redfaOption <*> actionOption
adiffOptions = ADiffOptions <$> tokOptions <*> actionOptions
loadToks :: RedfaSpec -> FilePath -> IO TV
loadToks redfa f =
V.fromList <$> (mmapFileByteString f Nothing >>= redfaTokenize redfa)
-- TODO: load in case it's not a regular file
loadToks :: TokOpts -> FilePath -> IO TV
loadToks topt f =
V.fromList <$> (mmapFileByteString f Nothing >>= tokenize topt f)
main :: IO ()
main =
doDiff :: TokOpts -> ADiffCommandOpts -> IO ()
doDiff topt (CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt}) = do
[toks1, toks2] <- traverse (loadToks topt) [f1, f2]
let output = hunks (max 0 ctxt) $ diffToks toks1 toks2
BB.hPutBuilder stdout $ pprHunks output
unless (null output) $ exitWith (ExitFailure 1)
doDiff _ _ = error "dispatch failure"
doDiff3 :: TokOpts -> ADiffCommandOpts -> IO ()
doDiff3 topt (CmdDiff3 ctxt f1 f2 f3 mo) = do
[toksMine, toksOld, toksYour] <- traverse (loadToks topt) [f1, f2, f3]
let d3 = diff3Toks mo toksMine toksOld toksYour
isConflict (MineChanged, _) = True
isConflict (YourChanged, _) = True
isConflict _ = False
hasConflict = any isConflict d3
BB.hPutBuilder stdout $
if mergeDoMerge mo
then do
fmtMerged mo d3
else pprHunks $ hunks (max 0 ctxt) d3
when hasConflict $ exitWith (ExitFailure 1)
doDiff3 _ _ = error "dispatch failure"
doPatch :: TokOpts -> ADiffCommandOpts -> IO ()
doPatch topt o = do
toksIn <- loadToks topt (patchInput o)
hs' <-
parsePatch <$>
case (patchInputPatch o) of
"-" -> B.getContents
fn -> B.readFile fn
hs <-
case hs' of
Left _ -> ioError $ userError "PATCHFILE parsing failed"
Right x -> pure x
let (toks, rej, warns) =
patchToks toksIn hs (patchReverse o) (patchMergeOpts o)
sus = not (null warns)
rewritingInput = null (patchOutput o)
outputStdout = patchOutput o == "-"
rejfile
| rewritingInput || outputStdout = patchInput o ++ ".rej"
| otherwise = patchOutput o ++ ".rej"
backupfile
| patchBackup o == "-" = ""
| patchBackup o == "" && rewritingInput = patchInput o ++ ".orig"
| otherwise = patchBackup o
traverse_ (hPutStrLn stderr . pprPatchWarn) warns
if patchDryRun o
then do
hPutStrLn stderr $
(if sus
then "OK"
else "Possibly problematic") ++
" patch with " ++ show (length rej :: Int) ++ " rejected hunks"
BB.hPutBuilder stdout (pprHunks rej)
else do
when (not $ null rej) $ BB.writeFile rejfile (pprHunks rej)
when (sus && not (null backupfile)) $
B.readFile (patchInput o) >>= B.writeFile backupfile
($ mconcat $ map (BB.byteString . snd) $ V.toList toks) $
if outputStdout
then BB.hPutBuilder stdout
else BB.writeFile
(if rewritingInput
then patchInput o
else patchOutput o)
when sus $ exitWith (ExitFailure 1)
main' :: IO ()
main' =
let opts :: ParserInfo ADiffOptions
opts =
info
(adiffOptions <**> versionOption "adiff" <**> helperOption)
(fullDesc <>
progDesc
"Compare, patch and merge files on arbitrarily-tokenized sequences." <>
"Compare, patch and merge files on arbitrarily tokenized sequences." <>
header "adiff: arbitrary-token diff utilities")
in do ADiffOptions {adiffRedfaOpt = ropt, adiffCmdOpts = copt} <-
in do ADiffOptions {adiffTokOpts = topt, adiffCmdOpts = copt} <-
customExecParser (prefs $ helpShowGlobals <> subparserInline) opts
redfa <- redfaPrepareRules ropt
case copt of
CmdDiff {diffFile1 = f1, diffFile2 = f2, context = ctxt} -> do
[toks1, toks2] <- traverse (loadToks redfa) [f1, f2]
BB.hPutBuilder stdout $
pprHunks $ hunks (max 0 ctxt) $ diffToks toks1 toks2
CmdPatch {} -> putStrLn "not supported yet"
CmdDiff3 ctxt f1 f2 f3 mo -> do
[toksMine, toksOld, toksYour] <-
traverse (loadToks redfa) [f1, f2, f3]
let d3 = diff3Toks mo toksMine toksOld toksYour
BB.hPutBuilder stdout $
if mergeDoMerge mo
then fmtMerged mo d3
else pprHunks $ hunks (max 0 ctxt) d3
(case copt of
CmdDiff {} -> doDiff
CmdDiff3 {} -> doDiff3
CmdPatch {} -> doPatch)
topt
copt
main :: IO ()
main =
main' `catch`
(\e -> do
let err = show (e :: IOException)
hPutStrLn stderr err
exitWith $ ExitFailure 2)

View file

@ -33,7 +33,7 @@ mergeOption forPatch =
switch
(short 'a' <>
long "add-linebreak" <>
help "Automatically add a line break after conflict markers") <*>
help "Automatically add a line break after conflict markers. Useful with `lines' lexer.") <*>
mo)
where
marker = fromString . replicate 7
@ -44,7 +44,7 @@ mergeOption forPatch =
long "merge" <>
help
(if forPatch
then "Merge using conflict markers instead of printing the rejected hunks"
then "Instead of printing the rejected thunks, merge using conflict markers as if the INPUT was `MYFILE' and the patch would produced `YOURFILE' from the original."
else "Output the merged file instead of the patch")) <*>
switch
(short 'i' <>
@ -69,11 +69,11 @@ mergeOption forPatch =
help
("On whitespace mismatch, output the version from " ++
(if forPatch
then "original file"
then "the original file"
else "MYFILE") ++
" instead of the one from " ++
(if forPatch
then "patch"
then "the context in patch"
else "YOURFILE"))) <*>
strOption
(long "merge-start" <>

View file

@ -1,4 +1,30 @@
module Patch where
module Patch (patchToks, pprPatchWarn) where
patchToks :: a
patchToks = undefined
import Format
import Types
import Merge
import qualified Data.ByteString as B
data PatchWarn = HunkMatched Hunk Int | HunkFailed Hunk
pprPatchWarn :: PatchWarn -> String
pprPatchWarn (HunkMatched ((o,n), _) at) = "hunk (-"++ show o ++ " +"++show n++") succeeded at "++show at
pprPatchWarn (HunkFailed ((o,n),_)) = "hunk (-"++ show o ++ " +"++show n++") FAILED"
patchToks :: TV -> [Hunk] -> Bool -> MergeOpts -> (TV, [Hunk], [PatchWarn])
patchToks toks hunks rev mopt = undefined
--let (tokss, hunk, sus) = patchHunksAt 0 0 0 (if rev then map revHunk hunks else hunks) mopt
patchHunksAt :: TV -> Int -> Int -> Int -> [Hunk] -> MergeOpts -> ([TV], [PatchWarn])
patchHunksAt toks tvoff origoff newoff hunks mopt = undefined
hunkToMatch :: Diff -> Diff
hunkToMatch = filter $ (`elem` [Keep, Remove, Original]) . fst
hunkToReplace :: Diff -> Diff
hunkToReplace = filter $ (`elem` [Keep, Add, MineChanged, YourChanged]) . fst
tokCmp :: Bool -> Tok -> Tok -> Bool
tokCmp True (False, _) (False, _) = True -- ignore the whitespace change
tokCmp _ a b = a == b -- otherwise just compare

View file

@ -1,177 +0,0 @@
{-# LANGUAGE FlexibleContexts #-}
module Redfa where
import Control.Monad
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
import Data.ByteString.UTF8 (fromString, toString)
import Data.List
import Data.Maybe
import qualified Data.Vector as V
import Options.Applicative
import Substr
import Text.Regex.TDFA
import Types
data RedfaOption
= RedfaOptionRules [BS]
| RedfaOptionFile String
deriving (Show)
data RedfaSpec =
RedfaSpec
{ redfaStart :: Int
, redfaNames :: V.Vector BS
, redfaRules :: V.Vector [RedfaRule] --start, mid, jump, token
}
data RedfaRule =
RedfaRule
{ rrRegexStart :: Regex
, rrRegexMid :: Regex
, rrJump :: Int
, rrIsToken :: Bool
}
redfaOption :: Parser RedfaOption
redfaOption =
RedfaOptionRules <$>
some
(strOption $
metavar "RULE" <>
short 'L' <>
long "lex" <> help "Lexing rule (specify repeatedly for more rules)") <|>
RedfaOptionFile <$>
strOption
(metavar "FILE" <>
short 'l' <>
long "lexer" <> help "File from where to load the lexing rules")
redfaOptionToRuleStrings :: RedfaOption -> IO [BS]
redfaOptionToRuleStrings (RedfaOptionRules x) = return x
redfaOptionToRuleStrings (RedfaOptionFile fn) =
B8.lines <$> B.readFile fn -- TODO improve
splitFirst :: Char -> BS -> (BS, BS)
splitFirst c s = B.splitAt (fromMaybe (B.length s) $ B8.elemIndex c s) s
redfaRuleStringToRuleStr :: BS -> Maybe (BS, BS, BS, Bool)
redfaRuleStringToRuleStr s =
let (spec, regex) = splitFirst ':' s
(from, to) = splitFirst '>' spec
sf = B8.strip from
(cleanFrom, isToken)
| B.null sf = (sf, True)
| B.head sf == fromIntegral (fromEnum '_') = (B.tail sf, False)
| otherwise = (sf, True)
go
| B.null s = Nothing
| B.head s == 35 = Nothing -- # comment
| B.null regex = Just (B.empty, B.empty, spec, isToken)
| B.null to = Just (cleanFrom, cleanFrom, B.tail regex, isToken)
| otherwise =
Just (cleanFrom, B8.strip $ B.tail to, B.tail regex, isToken)
in go
unescapeRegex :: MonadFail m => BS -> m BS
unescapeRegex s' = BL.toStrict . BB.toLazyByteString <$> unescape' s'
where
unescape' :: MonadFail m => BS -> m BB.Builder
unescape' s
| B.null s = return mempty
| B.head s == BI.c2w '\\' && B.null (B.tail s) =
fail "incomplete escape sequence"
| B.head s == BI.c2w '\\' =
let rest = B.tail s
cc = B.head rest
thechar =
BB.stringUtf8 $
case BI.w2c cc of
'`' -> "\\`"
'\'' -> "\\'"
'b' -> "\\b"
'B' -> "\\B"
'<' -> "\\<"
'>' -> "\\>"
'a' -> "\a"
'e' -> "\x1b"
'f' -> "\f"
'n' -> "\n"
'r' -> "\r"
't' -> "\t"
'\\' -> "\\\\"
a -> [a] --TODO add support for \x and \u
in (thechar <>) <$> unescape' (B.tail rest)
| otherwise = mappend (BB.word8 $ B.head s) <$> unescape' (B.tail s)
redfaPrepareRules :: RedfaOption -> IO RedfaSpec
redfaPrepareRules opt = do
(states, jumps, regexes, isToken) <-
unzip4 . mapMaybe redfaRuleStringToRuleStr <$> redfaOptionToRuleStrings opt
uRegexes <- traverse unescapeRegex regexes
startREs <- traverse (makeRegexM . (fromString "\\`" <>)) uRegexes
midREs <- traverse (makeRegexM . (fromString "\\`(.|\n)" <>)) uRegexes
let ids = nub . sort $ states ++ jumps
index' = fromJust . flip elemIndex ids
statesIds = map index' states
jumpsIds = map index' jumps
start = index' $ head states
rules :: [[RedfaRule]]
rules = do
stateId <- [0 .. length ids - 1]
return $ do
(a, b, (rs, rm), t) <-
zip4 statesIds jumpsIds (zip startREs midREs) isToken
guard $ a == stateId
return $ RedfaRule rs rm b t
return $ RedfaSpec start (V.fromList ids) (V.fromList rules)
redfaTokenize :: MonadFail m => RedfaSpec -> BS -> m [Tok]
redfaTokenize spec s = redfaTokenize' spec s (redfaStart spec) 0 []
redfaTokenize' ::
MonadFail m => RedfaSpec -> BS -> Int -> Int -> [Int] -> m [Tok]
redfaTokenize' spec s state off visited
| off >= B.length s = pure []
| otherwise =
let (ooff, reg) =
if off == 0
then (0, rrRegexStart)
else (1, rrRegexMid)
matchString = B.drop (off - ooff) s
matches :: [(RedfaRule, (MatchOffset, MatchLength))]
matches =
filter contOK $
zip rules $ map (\x -> match (reg x) matchString) rules
contOK (RedfaRule {rrJump = j}, (off', len))
| off' /= 0 = False
| len > ooff = True
| otherwise = j `notElem` visited
in case matches of
[] ->
fail $
"Tokenization could not continue from " <>
stateStrPretty <> " at offset " <> show (off + ooff)
((rule, (_, len)):_) ->
let matchLen = len - ooff
in (if matchLen > 0
then (:) (rrIsToken rule, substr off matchLen s)
else id) <$>
redfaTokenize'
spec
s
(rrJump rule)
(off + matchLen)
(if matchLen > 0
then []
else state : visited)
where
rules = redfaRules spec V.! state
stateStr = redfaNames spec V.! state
stateStrPretty
| B.null stateStr = "anonymous state"
| otherwise = "state `" <> toString stateStr <> "'"

View file

@ -1,7 +0,0 @@
module Substr where
import qualified Data.ByteString as B
import Types
substr :: Int -> Int -> BS -> BS
substr b l = B.take l . B.drop b

114
src/Tokenizers.hs Normal file
View file

@ -0,0 +1,114 @@
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE TupleSections #-}
module Tokenizers (TokOpts, tokOptions, tokenize) where
import qualified Data.ByteString as B
import qualified Data.ByteString.UTF8 as BU8
import qualified Data.Word8 as W8
import Options.Applicative
import Types
import qualified Unicode.Char.General as UC
data TokOpts =
TokOpts
{ optTokenizerName :: String
}
deriving (Show)
tokOptions :: Parser TokOpts
tokOptions =
TokOpts <$>
strOption
(metavar "LEXER" <>
short 'l' <>
long "lexer" <>
value "text" <>
help "Lexer name (use --help-lexers to list available ones)")
--TODO this should later allow choosing the lexer by suffix
tokenize :: MonadFail m => TokOpts -> FilePath -> BS -> m [Tok]
tokenize topt _ =
case filter ((== optTokenizerName topt) . tkName) tokenizers of
[t] -> runTokenizer t
_ -> const $ fail "Lexer not found"
data Tokenizer =
Tokenizer
{ tkName :: String
, tkDescription :: String
, runTokenizer :: forall m. MonadFail m =>
BS -> m [Tok]
}
tokenizers :: [Tokenizer]
tokenizers =
[ Tokenizer "lines" "works like the traditional diff" tokenizeLines
, Tokenizer "asciiwords" "separate by ASCII whitespace" tokenizeWords8
, Tokenizer "words" "separate on any UTF8 spacing" tokenizeWords
, Tokenizer
"text"
"separate groups of similar-class UTF8 characters"
tokenizeUnicode
]
tokenizeLines :: MonadFail m => BS -> m [Tok]
tokenizeLines = pure . map (True, ) . BU8.lines'
tokenizeWords8 :: MonadFail m => BS -> m [Tok]
tokenizeWords8 =
pure . map (\x -> (not $ W8.isSpace $ B.head x, x)) . bsGroupOn W8.isSpace
where
bsGroupOn f = B.groupBy (\l r -> f l == f r)
tokenizeWords :: MonadFail m => BS -> m [Tok]
tokenizeWords = pure . map makeTokenBU8 . groupOnBU8 UC.isWhiteSpace
makeTokenBU8 :: BS -> Tok
makeTokenBU8 s = if
maybe False (not . UC.isWhiteSpace . fst) (BU8.decode s)
then (True, s)
else (False, s)
{- needed?
foldri :: (Char -> Int -> a -> a) -> a -> BS -> a
foldri cons nil cs =
case decode cs of
Just (a, l) -> cons a l (foldrS cons nil $ drop l cs)
Nothing -> nil
-}
spanBU8 :: (Char -> Bool) -> BS -> (BS, BS)
spanBU8 f orig = B.splitAt (go 0 orig) orig
where
go :: Int -> BS -> Int
go n bs =
case BU8.decode bs of
Just (a, l) ->
if f a
then go (n + l) (B.drop l bs)
else n
Nothing -> n
groupByBU8 :: (Char -> Char -> Bool) -> BS -> [BS]
groupByBU8 f s =
case BU8.decode s of
Just (a, _) ->
let (g, s') = spanBU8 (f a) s
in g : groupByBU8 f s'
Nothing -> []
groupOnBU8 :: Eq a => (Char -> a) -> BS -> [BS]
groupOnBU8 f = groupByBU8 (\l r -> f l == f r)
tokenizeUnicode :: MonadFail m => BS -> m [Tok]
tokenizeUnicode = pure . map makeTokenBU8 . groupOnBU8 simpleUnicodeCategory
simpleUnicodeCategory :: Char -> Int
simpleUnicodeCategory c
| UC.isWhiteSpace c = 1
| UC.isAlphaNum c = 2
| UC.isSymbol c = 3
| UC.isPunctuation c = 4
| UC.isSeparator c = 5
| otherwise = 0

View file

@ -1,3 +1,5 @@
--{-# LANGUAGE BangPatterns #-}
module Types where
import Data.ByteString