src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO fix normalization
  24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  26
  27 - AD: Real ngrams extraction test
  28   from Gargantext.Text.Terms import extractTermsUnsupervised
  29   docs <- runCmdRepl $ selectDocs 1004
  30   extractTermsUnsupervised 3 $ DT.intercalate " "
  31                         $ catMaybes
  32                         $ Gargantext.map _hyperdataDocument_abstract docs
  33
  34 -}
  35 {-# LANGUAGE ConstraintKinds   #-}
  36 {-# LANGUAGE NoImplicitPrelude #-}
  37 {-# LANGUAGE OverloadedStrings #-}
  38 {-# LANGUAGE RankNTypes        #-}
  39 {-# LANGUAGE TemplateHaskell   #-}
  40 {-# LANGUAGE TypeFamilies      #-}
  41
  42 module Gargantext.Text.Eleve where
  43
  44 -- import Debug.Trace (trace)
  45 -- import Debug.SimpleReflect
  46
  47 import Data.Functor.Reverse
  48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
  49 import Control.Monad (forM_)
  50 import Data.Ord (Ord)
  51 import qualified Data.List as L
  52 import Data.Monoid
  53 import Data.Text (Text)
  54 import qualified Data.Text as T
  55 import Data.Map (Map)
  56 import Data.Maybe (fromMaybe)
  57 import qualified Data.Map as Map
  58 import Gargantext.Prelude hiding (cs)
  59 import qualified Gargantext.Prelude as GP
  60 import qualified Data.Tree as Tree
  61 import Data.Tree (Tree)
  62 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  63
  64 type Entropy e =
  65   ( Fractional e
  66   , Floating e
  67   , P.RealFloat e
  68   , Show e
  69   -- ^ TODO: only used for debugging
  70   )
  71 ------------------------------------------------------------------------
  72 -- | Example and tests for development
  73 data I e = I
  74   { _info_entropy    :: e
  75   , _info_entropy_var :: e
  76   , _info_autonomy   :: e
  77   }
  78
  79 instance Show e => Show (I e) where
  80   show (I e v n) = show (e, v, n)
  81
  82 makeLenses ''I
  83
  84 type ModEntropy i o e = (e -> e) -> i -> o
  85
  86 set_autonomy :: ModEntropy e (I e) e
  87 set_autonomy f e = I e e (f e)
  88
  89 set_entropy_var :: ModEntropy e (I e) e
  90 set_entropy_var f e = I e (f e) e
  91
  92
  93 data StartStop = Start | Stop
  94   deriving (Ord, Eq, Show)
  95
  96 data Token = NonTerminal Text
  97            | Terminal StartStop
  98   deriving (Ord, Eq, Show)
  99
 100 isTerminal :: Token -> Bool
 101 isTerminal (Terminal    _) = True
 102 isTerminal (NonTerminal _) = False
 103
 104 parseToken :: Text -> Token
 105 parseToken "<start>" = Terminal Start
 106 parseToken "<stop>"  = Terminal Stop
 107 parseToken t         = NonTerminal t
 108
 109 -- |
 110 -- >>> reverseTokens [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
 111 -- [Terminal Start,NonTerminal "york",NonTerminal "new",Terminal Stop]
 112 reverseTokens :: [Token] -> [Token]
 113 reverseTokens xs = case lastMay xs of
 114   Nothing -> []
 115   Just (Terminal Stop) -> reverseTokens' xs <> [Terminal Stop]
 116   _                    -> reverseTokens' xs
 117
 118 reverseTokens' :: [Token] -> [Token]
 119 reverseTokens' [] = []
 120 reverseTokens' [Terminal Stop] = []
 121 reverseTokens' [x] = [x]
 122 reverseTokens' (x:xs) = case x of
 123               Terminal Start -> [Terminal Start] <> reverseTokens' xs
 124               _              -> reverseTokens' xs <> [x]
 125
 126
 127 toToken :: [Text] -> [Token]
 128 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 129
 130 printToken :: Token -> Text
 131 printToken = f
 132   where
 133     f (NonTerminal x)  = x
 134     f (Terminal Start) = "<start>"
 135     f (Terminal Stop)  = "<stop>"
 136 ------------------------------------------------------------------------
 137
 138 data Trie k e
 139   = Node { _node_count    :: Int
 140          , _node_entropy  :: e
 141          , _node_children :: Map k (Trie k e)
 142          }
 143  | Leaf { _node_count    :: Int }
 144   deriving (Show)
 145
 146 makeLenses ''Trie
 147
 148 insertTries :: Ord k => [[k]] -> Trie k ()
 149 insertTries = L.foldr insertTrie emptyTrie
 150
 151 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 152 insertTrie []     n                    = n { _node_count = _node_count n +1}
 153 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 154 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 155   where
 156     f = Just . insertTrie xs . fromMaybe emptyTrie
 157
 158 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 159 -- emptyTrie = Node 0 mempty mempty
 160 emptyTrie :: Trie k e
 161 emptyTrie  = Leaf 0
 162
 163 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 164 mkTrie c children
 165   | Map.null children = Leaf c
 166   | otherwise         = Node c mempty children
 167
 168                         -----------------------------
 169 -- | Trie to Tree since Tree as nice print function
 170 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 171 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 172 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 173
 174 ------------------------------------------------------------------------
 175 ------------------------------------------------------------------------
 176 nan :: Floating e => e
 177 nan = 0 / 0
 178
 179 noNaNs :: P.RealFloat e => [e] -> [e]
 180 noNaNs = filter (not . P.isNaN)
 181
 182 updateIfDefined :: P.RealFloat e => e -> e -> e
 183 updateIfDefined e0 e | P.isNaN e = e0
 184                      | otherwise = e
 185
 186 sim :: Entropy e => e -> e -> Bool
 187 sim x y = x == y || (P.isNaN x && P.isNaN y)
 188
 189 subst :: Entropy e => (e, e) -> e -> e
 190 subst (src, dst) x | sim src x = dst
 191                    | otherwise = x
 192
 193 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
 194 entropyTrie _    (Leaf c)             = Leaf c
 195 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 196   where
 197     e = sum $ map f $ Map.toList children
 198     f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 199                              else - chc * P.logBase 2 chc
 200       where
 201         chc = fromIntegral (_node_count child) / fromIntegral c
 202 ------------------------------------------------------------------------
 203 normalizeLevel :: Entropy e => e -> [e] -> e -> e
 204 normalizeLevel prev = go . noNaNs
 205
 206   where
 207     go es  = \e -> ((e - prev) - m) / v
 208       where
 209         m  = mean      es
 210         v  = deviation es
 211
 212 {- Unused
 213
 214 nodeChildren :: Trie k e -> Map k (Trie k e)
 215 nodeChildren (Node _ _ cs) = cs
 216 nodeChildren (Leaf _)      = Map.empty
 217
 218 -}
 219
 220
 221 class IsTrie trie where
 222   buildTrie   :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
 223   nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
 224   nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
 225   findTrie    :: Ord k => [k] -> trie k e -> trie k e
 226
 227 -- UNUSED
 228 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
 229 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
 230
 231 instance IsTrie Trie where
 232   buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
 233
 234   nodeEntropy inE (Node _ e _) = e ^. inE
 235   nodeEntropy _   (Leaf _)     = nan
 236
 237   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 238   nodeChild _ (Leaf _)      = emptyTrie
 239
 240   findTrie ks t = L.foldl (flip nodeChild) t ks
 241
 242 normalizeEntropy :: Entropy e
 243                  => Getting e i e -> ModEntropy i o e
 244                  -> Trie k i -> Trie k o
 245 normalizeEntropy inE modE t = go (modE identity) level t
 246   where
 247     level = (entropyLevels inE t)
 248     go _ []         _                   = panic "normalizeEntropy' empty levels"
 249     go _ _          (Leaf c)            = Leaf c
 250 --      go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
 251     go f (es : ess) (Node c i children)
 252   --  | any (sim (i ^. inE)) es
 253       = Node c (f i) $ go (modE $ normalizeLevel (i ^. inE) es) ess <$> children
 254   --  | otherwise
 255   --  = panic "NOT an elem"
 256
 257
 258   {-
 259   This is only normalizing a node with respect to its brothers (unlike all the
 260   nodes of the same level).
 261
 262   normalizeEntropy inE modE = go $ modE identity
 263     where
 264       go _ (Leaf c) = Leaf c
 265       go f (Node c i children)
 266         | Map.null children =
 267             panic "normalizeEntropy: impossible"
 268         | otherwise         =
 269             Node c (f i) $ go (modE $ normalizeLevel es) <$> children
 270           where
 271             es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
 272   -}
 273 ------------------------------------------------------------------------
 274
 275 levels :: Trie k e -> [[Trie k e]]
 276 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 277   where
 278     subForest :: Trie k e -> [Trie k e]
 279     subForest (Leaf _)            = []
 280     subForest (Node _ _ children) = Map.elems children
 281
 282 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 283 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
 284
 285 ------------------------------------------------------------------------
 286
 287 data Tries k e = Tries
 288   { _fwd :: Trie k e
 289   , _bwd :: Trie k e
 290   }
 291
 292 makeLenses ''Tries
 293
 294
 295
 296 instance IsTrie Tries where
 297   buildTrie to n tts = Tries { _fwd = buildTrie to n tts
 298                              , _bwd = buildTrie to n (map reverse $ tts)
 299                              }
 300
 301   nodeEntropy inE (Tries fwd bwd) =
 302     -- VETODO reverse the query for bwd here
 303     -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
 304     mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
 305
 306   findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
 307   --                                                              ^^
 308   -- TODO: here this is tempting to reverse but this is not always what we
 309   -- want. See also nodeAutonomy.
 310   -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
 311   -- since recursivity of the function makes the reverse multiple times (I guess)
 312
 313   nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
 314
 315 ------------------------------------------------------------------------
 316 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 317 split _   _  [] = []
 318 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
 319 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
 320   where
 321     consRev [] xss = xss
 322     consRev xs xss = reverse xs : xss
 323
 324     go _ pref []                  = [reverse pref]
 325     go _ pref (Terminal Stop:_)   = [reverse pref]
 326     go t pref (Terminal Start:xs) = go t pref xs
 327     go t pref (x:xs) =
 328         -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
 329         if acc
 330           then go xt (x:pref) xs
 331           else consRev pref $ go xt0 [x] xs
 332       where
 333         xt   = nodeChild x t
 334         xt0  = nodeChild x t0
 335         et   = ne 0 t
 336     --  ^ entropy of the current prefix
 337         ext0 = ne 0 xt0
 338     --  ^ entropy of [x]
 339         ext  = ne 0 xt
 340     --  ^ entropy of the current prefix plus x
 341         acc  = ext > et + ext0
 342         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 343
 344     ne d t = if P.isNaN e then d else e
 345       where e = nodeEntropy inE t
 346
 347 {-
 348 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 349 split inE t0 ts =
 350   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 351 -}
 352
 353 ------------------------------------------------------------------------
 354 ------------------------------------------------------------------------
 355
 356 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 357 mainEleve _ _ = []
 358 {-
 359 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
 360   where
 361     inp = toToken <$> input
 362     t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
 363 -}
 364
 365 chunkAlongEleve :: Int -> [a] -> [[a]]
 366 chunkAlongEleve n xs = L.take n <$> L.tails xs
 367
 368 toToken' :: Int -> [[Text]] -> [[Token]]
 369 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
 370
 371 ---------------------------------------------
 372 set_entropy_vars :: Entropy e  => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
 373 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
 374   mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd
 375
 376 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
 377 mapTree f k t = go f k t
 378   where
 379     go _ _ (Leaf c)            = Leaf c
 380     go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
 381
 382 entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
 383 entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
 384   where
 385     fwd = (nodeEntropy inE (_fwd $ findTrie                ng  tries))
 386     bwd = (nodeEntropy inE (_bwd $ findTrie (reverseTokens ng) tries))
 387
 388 ---------------------------------------------
 389 -- | TODO remove function below after following bug fixed
 390 -- | TODO entropy_var' /= entropy_var on "<start> token.."
 391 entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
 392 entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
 393                              , (nodeEntropy info_entropy (_bwd $ findTrie (reverseTokens ng) tries))
 394                              ]
 395                         )
 396
 397 entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
 398 entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
 399                             , (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
 400                             ]
 401                         )
 402   where
 403     ntf = parseToken <$> ng
 404     ntb = parseToken <$> reverse ng
 405
 406 ---------------------------------------------
 407
 408 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
 409 testEleve debug n output checks = do
 410   let
 411     {-
 412     pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
 413           | ps <- L.nub $ [ c
 414                           | m <- [1..n]
 415                           , cs <- chunkAlong m 1 <$> inp
 416                           , c <- cs
 417                           ]
 418           ]
 419     -}
 420   --res = map (map printToken) . split identity fwd <$> inp
 421   --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
 422     res = map (map printToken) . split info_autonomy nt <$> inp
 423   when debug $ do
 424     P.putStrLn (show input)
 425     -- forM_ pss (P.putStrLn . show)
 426     P.putStrLn ""
 427     P.putStrLn "Forward:"
 428     printTrie (_fwd t)
 429     P.putStrLn ""
 430     P.putStrLn "Backward:"
 431     printTrie (_bwd t)
 432     P.putStrLn ""
 433     P.putStrLn "Levels:"
 434     forM_ (entropyLevels identity t'') $ \level ->
 435       P.putStrLn $ "  " <> show level
 436     P.putStrLn ""
 437     P.putStrLn "Normalized:"
 438     printTrie nt
 439     P.putStrLn ""
 440     P.putStrLn "Splitting:"
 441     P.putStrLn $ show res
 442   forM_ checks checker
 443   pure $ expected == res
 444
 445   where
 446     out      = T.words <$> output
 447     expected = fmap (T.splitOn "-") <$> out
 448     input    = (T.splitOn "-" =<<) <$> out
 449     inp      = toToken <$> input
 450
 451     t :: Tries Token Double
 452     t        = buildTrie toToken' n input
 453              & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
 454              -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
 455
 456     t'' :: Trie Token Double
 457     t'' = set_entropy_vars identity (\e _i -> e) t
 458
 459     nt :: Trie Token (I Double)
 460     nt = normalizeEntropy identity set_autonomy t''
 461
 462     -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
 463     -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
 464
 465     check f msg ref my =
 466       if f ref my
 467         then P.putStrLn $ "    PASS " <> msg <> " " <> show ref
 468         else P.putStrLn $ "    FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
 469
 470     checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
 471       let ns  = parseToken <$> T.words ngram
 472           nsb = parseToken <$> (reverse $ T.words ngram)
 473           t'  = findTrie ns t
 474           tvar  = findTrie ns  t''
 475           nt' = findTrie ns nt
 476
 477       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 478       check (==) "count"       count       (_node_count tvar)
 479       check sim  "entropy_var"      entropy (nodeEntropy identity tvar)
 480       --check sim  ("entropy_varOK")  entropy (entropy_var (T.words ngram) nt)
 481       --check sim  "entropy"     entropy  (entropy_var' nt (parseToken <$> T.words ngram))
 482       {- ^ FIXME 2 fun above should have same results (error in reverseToken):
 483         <start> New York:
 484         PASS count 1
 485         FAIL entropy ref=NaN my=0.0
 486       -}
 487
 488       check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy nt')
 489       check sim  "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
 490       check sim  "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
 491
 492     printTrie :: Show e => Trie Token e -> IO ()
 493     printTrie =
 494       P.putStrLn . Tree.drawTree
 495                  . fmap show
 496                  . toTree (NonTerminal "")
 497
 498 -- | TODO real data is a list of tokenized sentences
 499 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 500 example0 =  ["New-York is New-York and New-York"]
 501 example1 =  ["to-be or not to-be"]
 502 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 503 example3 =  example0 <> example0
 504        -- > TEST: Should not have York New in the trie
 505 example4 =  ["a-b-c-d e a-b-c-d f"]
 506 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 507 example6 =  ["le-petit chat"
 508             ,"le-petit chien"
 509             ,"le-petit rat"
 510             ,"le gros rat"
 511             ]
 512
 513 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
 514
 515 checks0 =
 516   [("<start>", 1, nan, nan, nan, nan, 0.0)
 517   ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
 518   ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
 519   ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 520   ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 521 --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
 522 -- need to count it.
 523
 524 --{-
 525   ,("<start> New", 1, nan, nan, nan, nan, 0.0)
 526   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
 527   ,("York is", 1, 0.0, nan, nan, nan, 0.0)
 528   ,("is New", 1, 0.0, nan, nan, nan, 0.0)
 529   ,("York and", 1, 0.0, nan, nan, nan, 0.0)
 530   ,("and New", 1, 0.0, nan, nan, nan, 0.0)
 531   ,("York <stop>", 1, nan, nan, nan, nan, nan)
 532
 533   ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
 534   ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
 535   ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
 536   ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
 537   ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
 538   ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
 539   ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
 540   ,("New York <stop>", 1, nan, nan, nan, nan, nan)
 541 --}
 542   ]
 543
 544
 545
 546 checks2 =
 547   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 548   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 549   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 550   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 551   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 552   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 553   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 554   ]
 555
 556
 557 runTests :: IO ()
 558 runTests =
 559   forM_
 560     [("example0", 2, example0, checks0)
 561     ,("example1", 2, example1, [])
 562     ,("example2", 3, example2, checks2)
 563     ,("example3", 2, example3, [])
 564     ,("example4", 4, example4, [])
 565     ,("example5", 5, example5, [])
 566     ]
 567     (\(name, n, ex, checks) -> do
 568       P.putStrLn $ name <> " " <> show n
 569       b <- testEleve False n ex checks
 570       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 571     )