src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO fix normalization
  24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  26
  27 - AD: Real ngrams extraction test
  28   from Gargantext.Text.Terms import extractTermsUnsupervised
  29   docs <- runCmdRepl $ selectDocs 1004
  30   extractTermsUnsupervised 3 $ DT.intercalate " "
  31                         $ catMaybes
  32                         $ Gargantext.map _hyperdataDocument_abstract docs
  33
  34 -}
  35 {-# LANGUAGE ConstraintKinds   #-}
  36 {-# LANGUAGE NoImplicitPrelude #-}
  37 {-# LANGUAGE OverloadedStrings #-}
  38 {-# LANGUAGE RankNTypes        #-}
  39 {-# LANGUAGE TemplateHaskell   #-}
  40 {-# LANGUAGE TypeFamilies      #-}
  41
  42 module Gargantext.Text.Eleve where
  43
  44 -- import Debug.Trace (trace)
  45 -- import Debug.SimpleReflect
  46
  47 import Data.Functor.Reverse
  48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
  49 import Control.Monad (forM_)
  50 import Data.Ord (Ord)
  51 import qualified Data.List as L
  52 import Data.Monoid
  53 import Data.Text (Text)
  54 import qualified Data.Text as T
  55 import Data.Map (Map)
  56 import Data.Maybe (fromMaybe)
  57 import qualified Data.Map as Map
  58 import Gargantext.Prelude hiding (cs)
  59 import qualified Gargantext.Prelude as GP
  60 import qualified Data.Tree as Tree
  61 import Data.Tree (Tree)
  62 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  63
  64 type Entropy e =
  65   ( Fractional e
  66   , Floating e
  67   , P.RealFloat e
  68   , Show e
  69   -- ^ TODO: only used for debugging
  70   )
  71 ------------------------------------------------------------------------
  72 -- | Example and tests for development
  73 data I e = I
  74   { _info_entropy    :: e
  75   , _info_entropy_var :: e
  76   , _info_autonomy   :: e
  77   }
  78
  79 instance Show e => Show (I e) where
  80   show (I e v n) = show (e, v, n)
  81
  82 makeLenses ''I
  83
  84 type ModEntropy i o e = (e -> e) -> i -> o
  85
  86 set_autonomy :: ModEntropy e (I e) e
  87 set_autonomy f e = I e e (f e)
  88
  89 set_entropy_var :: ModEntropy e (I e) e
  90 set_entropy_var f e = I e (f e) e
  91
  92
  93 data StartStop = Start | Stop
  94   deriving (Ord, Eq, Show)
  95
  96 data Token = NonTerminal Text
  97            | Terminal StartStop
  98   deriving (Ord, Eq, Show)
  99
 100 isTerminal :: Token -> Bool
 101 isTerminal (Terminal    _) = True
 102 isTerminal (NonTerminal _) = False
 103
 104 parseToken :: Text -> Token
 105 parseToken "<start>" = Terminal Start
 106 parseToken "<stop>"  = Terminal Stop
 107 parseToken t         = NonTerminal t
 108
 109 -- |
 110 -- >>> reverseTokens [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
 111 -- [Terminal Start,NonTerminal "york",NonTerminal "new",Terminal Stop]
 112 reverseTokens :: [Token] -> [Token]
 113 reverseTokens xs = case lastMay xs of
 114   Nothing -> []
 115   Just (Terminal Stop) -> reverseTokens' xs <> [Terminal Stop]
 116   _                    -> reverseTokens' xs
 117
 118 reverseTokens' :: [Token] -> [Token]
 119 reverseTokens' [] = []
 120 reverseTokens' [Terminal Stop] = []
 121 reverseTokens' [x] = [x]
 122 reverseTokens' (x:xs) = case x of
 123               Terminal Start -> [Terminal Start] <> reverseTokens' xs
 124               _              -> reverseTokens' xs <> [x]
 125
 126
 127 toToken :: [Text] -> [Token]
 128 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 129
 130 printToken :: Token -> Text
 131 printToken = f
 132   where
 133     f (NonTerminal x)  = x
 134     f (Terminal Start) = "<start>"
 135     f (Terminal Stop)  = "<stop>"
 136 ------------------------------------------------------------------------
 137
 138 data Trie k e
 139   = Node { _node_count    :: Int
 140          , _node_entropy  :: e
 141          , _node_children :: Map k (Trie k e)
 142          }
 143  | Leaf { _node_count    :: Int }
 144   deriving (Show)
 145
 146 makeLenses ''Trie
 147
 148 insertTries :: Ord k => [[k]] -> Trie k ()
 149 insertTries = L.foldr insertTrie emptyTrie
 150
 151 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 152 insertTrie []     n                    = n { _node_count = _node_count n +1}
 153 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 154 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 155   where
 156     f = Just . insertTrie xs . fromMaybe emptyTrie
 157
 158 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 159 -- emptyTrie = Node 0 mempty mempty
 160 emptyTrie :: Trie k e
 161 emptyTrie  = Leaf 0
 162
 163 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 164 mkTrie c children
 165   | Map.null children = Leaf c
 166   | otherwise         = Node c mempty children
 167
 168                         -----------------------------
 169 -- | Trie to Tree since Tree as nice print function
 170 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 171 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 172 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 173
 174 ------------------------------------------------------------------------
 175 ------------------------------------------------------------------------
 176 nan :: Floating e => e
 177 nan = 0 / 0
 178
 179 noNaNs :: P.RealFloat e => [e] -> [e]
 180 noNaNs = filter (not . P.isNaN)
 181
 182 updateIfDefined :: P.RealFloat e => e -> e -> e
 183 updateIfDefined e0 e | P.isNaN e = e0
 184                      | otherwise = e
 185
 186 sim :: Entropy e => e -> e -> Bool
 187 sim x y = x == y || (P.isNaN x && P.isNaN y)
 188
 189 subst :: Entropy e => (e, e) -> e -> e
 190 subst (src, dst) x | sim src x = dst
 191                    | otherwise = x
 192
 193 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
 194 entropyTrie _    (Leaf c)             = Leaf c
 195 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 196   where
 197     e = sum $ map f $ Map.toList children
 198     f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 199                              else - chc * P.logBase 2 chc
 200       where
 201         chc = fromIntegral (_node_count child) / fromIntegral c
 202 ------------------------------------------------------------------------
 203 normalizeLevel :: Entropy e => e -> e -> e -> e -> e
 204 normalizeLevel prev m v e = ((e - prev) - m) / v
 205
 206 {- Unused
 207
 208 nodeChildren :: Trie k e -> Map k (Trie k e)
 209 nodeChildren (Node _ _ cs) = cs
 210 nodeChildren (Leaf _)      = Map.empty
 211
 212 -}
 213
 214
 215 class IsTrie trie where
 216   buildTrie   :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
 217   nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
 218   nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
 219   findTrie    :: Ord k => [k] -> trie k e -> trie k e
 220   normalizeEntropy :: Entropy e
 221                    => Getting e i e -> ModEntropy i o e
 222                    -> trie k i -> trie k o
 223
 224 -- UNUSED
 225 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
 226 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
 227
 228 instance IsTrie Trie where
 229   buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
 230
 231   nodeEntropy inE (Node _ e _) = e ^. inE
 232   nodeEntropy _   (Leaf _)     = nan
 233
 234   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 235   nodeChild _ (Leaf _)      = emptyTrie
 236
 237   findTrie ks t = L.foldl (flip nodeChild) t ks
 238
 239   normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
 240     where
 241       go _ []                _                   = panic "normalizeEntropy' empty levels"
 242       go _ _                 (Leaf c)            = Leaf c
 243       go f ((m, v, _) : ess) (Node c i children)
 244         = Node c (f i) $ go (modE $ normalizeLevel (i ^. inE) m v) ess <$> children
 245
 246
 247   {-
 248   This is only normalizing a node with respect to its brothers (unlike all the
 249   nodes of the same level).
 250
 251   normalizeEntropy inE modE = go $ modE identity
 252     where
 253       go _ (Leaf c) = Leaf c
 254       go f (Node c i children)
 255         | Map.null children =
 256             panic "normalizeEntropy: impossible"
 257         | otherwise         =
 258             Node c (f i) $ go (modE $ normalizeLevel es) <$> children
 259           where
 260             es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
 261   -}
 262 ------------------------------------------------------------------------
 263
 264 levels :: Trie k e -> [[Trie k e]]
 265 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 266   where
 267     subForest :: Trie k e -> [Trie k e]
 268     subForest (Leaf _)            = []
 269     subForest (Node _ _ children) = Map.elems children
 270
 271 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 272 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
 273
 274 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
 275 normalizationLevels inE = fmap f . entropyLevels inE
 276   where
 277     f es = (mean es, deviation es, length es)
 278
 279 ------------------------------------------------------------------------
 280
 281 data Tries k e = Tries
 282   { _fwd :: Trie k e
 283   , _bwd :: Trie k e
 284   }
 285
 286 makeLenses ''Tries
 287
 288
 289
 290 instance IsTrie Tries where
 291   buildTrie to n tts = Tries { _fwd = buildTrie to n tts
 292                              , _bwd = buildTrie to n (map reverse $ tts)
 293                              }
 294
 295   nodeEntropy inE (Tries fwd bwd) =
 296     -- VETODO reverse the query for bwd here
 297     -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
 298     mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
 299
 300   findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
 301   --                                                              ^^
 302   -- TODO: here this is tempting to reverse but this is not always what we
 303   -- want. See also nodeAutonomy.
 304   -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
 305   -- since recursivity of the function makes the reverse multiple times (I guess)
 306
 307   nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
 308
 309   normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
 310
 311 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
 312 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
 313
 314 ------------------------------------------------------------------------
 315 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 316 split _   _  [] = []
 317 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
 318 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
 319   where
 320     consRev [] xss = xss
 321     consRev xs xss = reverse xs : xss
 322
 323     go _ pref []                  = [reverse pref]
 324     go _ pref (Terminal Stop:_)   = [reverse pref]
 325     go t pref (Terminal Start:xs) = go t pref xs
 326     go t pref (x:xs) =
 327         -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
 328         if acc
 329           then go xt (x:pref) xs
 330           else consRev pref $ go xt0 [x] xs
 331       where
 332         xt   = nodeChild x t
 333         xt0  = nodeChild x t0
 334         et   = ne 0 t
 335     --  ^ entropy of the current prefix
 336         ext0 = ne 0 xt0
 337     --  ^ entropy of [x]
 338         ext  = ne 0 xt
 339     --  ^ entropy of the current prefix plus x
 340         acc  = ext > et + ext0
 341         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 342
 343     ne d t = if P.isNaN e then d else e
 344       where e = nodeEntropy inE t
 345
 346 {-
 347 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 348 split inE t0 ts =
 349   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 350 -}
 351
 352 ------------------------------------------------------------------------
 353 ------------------------------------------------------------------------
 354
 355 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 356 mainEleve _ _ = []
 357 {-
 358 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
 359   where
 360     inp = toToken <$> input
 361     t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
 362 -}
 363
 364 chunkAlongEleve :: Int -> [a] -> [[a]]
 365 chunkAlongEleve n xs = L.take n <$> L.tails xs
 366
 367 toToken' :: Int -> [[Text]] -> [[Token]]
 368 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
 369
 370 ---------------------------------------------
 371 set_entropy_vars :: Entropy e  => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
 372 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
 373   mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd
 374
 375 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
 376 mapTree f k t = go f k t
 377   where
 378     go _ _ (Leaf c)            = Leaf c
 379     go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
 380
 381 entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
 382 entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
 383   where
 384     fwd = (nodeEntropy inE (_fwd $ findTrie                ng  tries))
 385     bwd = (nodeEntropy inE (_bwd $ findTrie (reverseTokens ng) tries))
 386
 387 ---------------------------------------------
 388 -- | TODO remove function below after following bug fixed
 389 -- | TODO entropy_var' /= entropy_var on "<start> token.."
 390 entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
 391 entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
 392                              , (nodeEntropy info_entropy (_bwd $ findTrie (reverseTokens ng) tries))
 393                              ]
 394                         )
 395
 396 entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
 397 entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
 398                             , (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
 399                             ]
 400                         )
 401   where
 402     ntf = parseToken <$> ng
 403     ntb = parseToken <$> reverse ng
 404
 405 ---------------------------------------------
 406
 407 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
 408 testEleve debug n output checks = do
 409   let
 410     {-
 411     pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
 412           | ps <- L.nub $ [ c
 413                           | m <- [1..n]
 414                           , cs <- chunkAlong m 1 <$> inp
 415                           , c <- cs
 416                           ]
 417           ]
 418     -}
 419   --res = map (map printToken) . split identity fwd <$> inp
 420   --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
 421     res = map (map printToken) . split info_autonomy nt <$> inp
 422   when debug $ do
 423     P.putStrLn (show input)
 424     -- forM_ pss (P.putStrLn . show)
 425     P.putStrLn ""
 426     P.putStrLn "Forward:"
 427     printTrie (_fwd nt)
 428     P.putStrLn ""
 429     P.putStrLn "Backward:"
 430     printTrie (_bwd nt)
 431     P.putStrLn ""
 432     P.putStrLn "Levels:"
 433     forM_ (normalizationLevels identity t'') $ \level ->
 434       P.putStrLn $ "  " <> show level
 435     P.putStrLn ""
 436     P.putStrLn "Entropy Var:"
 437     printTrie t''
 438     P.putStrLn ""
 439     P.putStrLn "Splitting:"
 440     P.putStrLn $ show res
 441   forM_ checks checker
 442   pure $ expected == res
 443
 444   where
 445     out      = T.words <$> output
 446     expected = fmap (T.splitOn "-") <$> out
 447     input    = (T.splitOn "-" =<<) <$> out
 448     inp      = toToken <$> input
 449
 450     t :: Tries Token Double
 451     t        = buildTrie toToken' n input
 452              & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
 453              -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
 454
 455     nt :: Tries Token (I Double)
 456     nt = normalizeEntropy identity set_autonomy t
 457
 458     t'' :: Trie Token Double
 459     t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
 460
 461     -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
 462     -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
 463
 464     check f msg ref my =
 465       if f ref my
 466         then P.putStrLn $ "    PASS " <> msg <> " " <> show ref
 467         else P.putStrLn $ "    FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
 468
 469     checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
 470       let ns  = parseToken <$> T.words ngram
 471           nsb = parseToken <$> (reverse $ T.words ngram)
 472           t'  = findTrie ns t
 473           tvar  = findTrie ns  t''
 474           nt' = findTrie ns nt
 475
 476       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 477       check (==) "count"       count       (_node_count tvar)
 478       check sim  "entropy_var"      entropy (nodeEntropy identity tvar)
 479       --check sim  ("entropy_varOK")  entropy (entropy_var (T.words ngram) nt)
 480       --check sim  "entropy"     entropy  (entropy_var' nt (parseToken <$> T.words ngram))
 481       {- ^ FIXME 2 fun above should have same results (error in reverseToken):
 482         <start> New York:
 483         PASS count 1
 484         FAIL entropy ref=NaN my=0.0
 485       -}
 486
 487       check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy nt')
 488       check sim  "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
 489       check sim  "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
 490
 491     printTrie :: Show e => Trie Token e -> IO ()
 492     printTrie =
 493       P.putStrLn . Tree.drawTree
 494                  . fmap show
 495                  . toTree (NonTerminal "")
 496
 497 -- | TODO real data is a list of tokenized sentences
 498 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 499 example0 =  ["New-York is New-York and New-York"]
 500 example1 =  ["to-be or not to-be"]
 501 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 502 example3 =  example0 <> example0
 503        -- > TEST: Should not have York New in the trie
 504 example4 =  ["a-b-c-d e a-b-c-d f"]
 505 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 506 example6 =  ["le-petit chat"
 507             ,"le-petit chien"
 508             ,"le-petit rat"
 509             ,"le gros rat"
 510             ]
 511
 512 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
 513
 514 checks0 =
 515   [("<start>", 1, nan, nan, nan, nan, 0.0)
 516   ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
 517   ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
 518   ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 519   ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 520 --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
 521 -- need to count it.
 522
 523 --{-
 524   ,("<start> New", 1, nan, nan, nan, nan, 0.0)
 525   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
 526   ,("York is", 1, 0.0, nan, nan, nan, 0.0)
 527   ,("is New", 1, 0.0, nan, nan, nan, 0.0)
 528   ,("York and", 1, 0.0, nan, nan, nan, 0.0)
 529   ,("and New", 1, 0.0, nan, nan, nan, 0.0)
 530   ,("York <stop>", 1, nan, nan, nan, nan, nan)
 531
 532   ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
 533   ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
 534   ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
 535   ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
 536   ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
 537   ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
 538   ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
 539   ,("New York <stop>", 1, nan, nan, nan, nan, nan)
 540 --}
 541   ]
 542
 543
 544
 545 checks2 =
 546   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 547   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 548   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 549   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 550   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 551   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 552   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 553   ]
 554
 555
 556 runTests :: IO ()
 557 runTests =
 558   forM_
 559     [("example0", 2, example0, checks0)
 560     ,("example1", 2, example1, [])
 561     ,("example2", 3, example2, checks2)
 562     ,("example3", 2, example3, [])
 563     ,("example4", 4, example4, [])
 564     ,("example5", 5, example5, [])
 565     ]
 566     (\(name, n, ex, checks) -> do
 567       P.putStrLn $ name <> " " <> show n
 568       b <- testEleve False n ex checks
 569       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 570     )