src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  25
  26 - AD: Real ngrams extraction test
  27   from Gargantext.Text.Terms import extractTermsUnsupervised
  28   docs <- runCmdRepl $ selectDocs 1004
  29   extractTermsUnsupervised 3 $ DT.intercalate " "
  30                         $ catMaybes
  31                         $ Gargantext.map _hyperdataDocument_abstract docs
  32
  33 -}
  34 {-# LANGUAGE ConstraintKinds   #-}
  35 {-# LANGUAGE NoImplicitPrelude #-}
  36 {-# LANGUAGE OverloadedStrings #-}
  37 {-# LANGUAGE RankNTypes        #-}
  38 {-# LANGUAGE TemplateHaskell   #-}
  39 {-# LANGUAGE TypeFamilies      #-}
  40
  41 module Gargantext.Text.Eleve where
  42
  43 -- import Debug.Trace (trace)
  44 -- import Debug.SimpleReflect
  45
  46 import Control.Lens hiding (levels, children)
  47 import Control.Monad (forM_)
  48 import Data.Ord (Ord)
  49 import qualified Data.List as L
  50 import Data.Monoid
  51 import Data.Text (Text)
  52 import qualified Data.Text as T
  53 import Data.Map (Map)
  54 import Data.Maybe (fromMaybe)
  55 import qualified Data.Map as Map
  56 import Gargantext.Prelude hiding (cs)
  57 import qualified Data.Tree as Tree
  58 import Data.Tree (Tree)
  59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  60
  61 nan :: Floating e => e
  62 nan = 0 / 0
  63
  64 noNaNs :: P.RealFloat e => [e] -> [e]
  65 noNaNs = filter (not . P.isNaN)
  66
  67 updateIfDefined :: P.RealFloat e => e -> e -> e
  68 updateIfDefined e0 e | P.isNaN e = e0
  69                      | otherwise = e
  70
  71 sim :: Entropy e => e -> e -> Bool
  72 sim x y = x == y || (P.isNaN x && P.isNaN y)
  73
  74 subst :: Entropy e => (e, e) -> e -> e
  75 subst (src, dst) x | sim src x = dst
  76                    | otherwise = x
  77 ------------------------------------------------------------------------
  78
  79 type Entropy e =
  80   ( Fractional e
  81   , Floating e
  82   , P.RealFloat e
  83   , Show e
  84   -- ^ TODO: only used for debugging
  85   )
  86 ------------------------------------------------------------------------
  87 -- | Example and tests for development
  88 data I e = I
  89   { _info_entropy    :: e
  90   , _info_entropy_var :: e
  91   , _info_autonomy   :: e
  92   }
  93
  94 instance Show e => Show (I e) where
  95   show (I e ev a) = show (e, ev, a)
  96
  97 makeLenses ''I
  98
  99 type ModEntropy i o e = (e -> e) -> i -> o
 100
 101 set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
 102 set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)
 103
 104 set_entropy_var :: Entropy e => Setter e (I e) e e
 105 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
 106
 107 data StartStop = Start | Stop
 108   deriving (Ord, Eq, Show)
 109
 110 data Token = NonTerminal Text
 111            | Terminal StartStop
 112   deriving (Ord, Eq, Show)
 113
 114 isTerminal :: Token -> Bool
 115 isTerminal (Terminal    _) = True
 116 isTerminal (NonTerminal _) = False
 117
 118 parseToken :: Text -> Token
 119 parseToken "<start>" = Terminal Start
 120 parseToken "<stop>"  = Terminal Stop
 121 parseToken t         = NonTerminal t
 122
 123 toToken :: [Text] -> [Token]
 124 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 125
 126 printToken :: Token -> Text
 127 printToken = f
 128   where
 129     f (NonTerminal x)  = x
 130     f (Terminal Start) = "<start>"
 131     f (Terminal Stop)  = "<stop>"
 132 ------------------------------------------------------------------------
 133
 134 data Trie k e
 135   = Node { _node_count    :: Int
 136          , _node_entropy  :: e
 137          , _node_children :: Map k (Trie k e)
 138          }
 139  | Leaf { _node_count    :: Int }
 140   deriving (Show)
 141
 142 makeLenses ''Trie
 143
 144 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 145 insertTrie []     n                    = n { _node_count = _node_count n +1}
 146 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 147 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 148   where
 149     f = Just . insertTrie xs . fromMaybe emptyTrie
 150
 151 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 152 -- emptyTrie = Node 0 mempty mempty
 153 emptyTrie :: Trie k e
 154 emptyTrie  = Leaf 0
 155
 156 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 157 mkTrie c children
 158   | Map.null children = Leaf c
 159   | otherwise         = Node c mempty children
 160
 161                         -----------------------------
 162 -- | Trie to Tree since Tree as nice print function
 163 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 164 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 165 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 166
 167 ------------------------------------------------------------------------
 168 ------------------------------------------------------------------------
 169 normalizeLevel :: Entropy e => e -> e -> e -> e
 170 normalizeLevel m v e = (e - m) / v
 171
 172 {- Unused
 173
 174 nodeChildren :: Trie k e -> Map k (Trie k e)
 175 nodeChildren (Node _ _ cs) = cs
 176 nodeChildren (Leaf _)      = Map.empty
 177
 178 -}
 179
 180 chunkAlongEleve :: Int -> [a] -> [[a]]
 181 chunkAlongEleve n xs = L.take n <$> L.tails xs
 182
 183 data Direction = Backward | Forward
 184
 185 buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
 186 buildTrie d n sentences
 187   = L.foldr insertTrie emptyTrie
 188   . L.concat
 189   $ ( filter (/= [Terminal (term d)])
 190     . chunkAlongEleve (n + 1)
 191     . order d
 192     )
 193  <$> sentences
 194   where
 195     order Forward  = identity
 196     order Backward = reverse
 197     term  Forward  = Stop
 198     term  Backward = Start
 199
 200 class IsTrie trie where
 201   entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e
 202   nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
 203   nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
 204   findTrie    :: Ord k => [k] -> trie k e -> trie k e
 205   printTrie   :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
 206   evTrie      :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
 207   normalizeEntropy :: Entropy e
 208                    => Getting e i e -> ModEntropy i o e
 209                    -> trie k i -> trie k o
 210
 211 instance IsTrie Trie where
 212
 213   entropyTrie _    (Leaf c)             = Leaf c
 214   entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 215     where
 216       children' = Map.toList children
 217       sum_count = sum $ _node_count . snd <$> children'
 218       e | sum_count == 0 = nan
 219         | otherwise      = sum $ f <$> children'
 220       f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 221                               else - chc * P.logBase 2 chc
 222         where
 223           chc = fromIntegral (_node_count child) / fromIntegral c
 224
 225   nodeEntropy inE (Node _ e _) = e ^. inE
 226   nodeEntropy _   (Leaf _)     = nan
 227
 228   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 229   nodeChild _ (Leaf _)      = emptyTrie
 230
 231   findTrie ks t = L.foldl (flip nodeChild) t ks
 232
 233   printTrie inE t = do
 234     P.putStrLn . Tree.drawTree
 235                 . fmap show
 236                 $ toTree (NonTerminal "") t
 237     P.putStrLn "  Levels:"
 238     forM_ (normalizationLevels inE t) $ \level ->
 239       P.putStrLn $ "    " <> show level
 240
 241   evTrie inE setEV = go nan
 242     where
 243       go _  (Leaf c)            = Leaf c
 244       go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
 245         where e1 = i ^. inE
 246
 247       ev 0  0  = nan
 248       ev i0 i1 = i1 - i0
 249
 250   normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
 251     where
 252       go _ _                 (Leaf c)            = Leaf c
 253       go _ []                _                   = panic "normalizeEntropy' empty levels"
 254       go f ((m, v, _) : ess) (Node c i children)
 255         = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
 256 ------------------------------------------------------------------------
 257
 258 levels :: Trie k e -> [[Trie k e]]
 259 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 260   where
 261     subForest :: Trie k e -> [Trie k e]
 262     subForest (Leaf _)            = []
 263     subForest (Node _ _ children) = Map.elems children
 264
 265 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 266 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
 267
 268 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
 269 normalizationLevels inE = fmap f . entropyLevels inE
 270   where
 271     f es = (mean es, deviation es, length es)
 272
 273 ------------------------------------------------------------------------
 274
 275 data Tries k e = Tries
 276   { _fwd :: Trie k e
 277   , _bwd :: Trie k e
 278   }
 279
 280 makeLenses ''Tries
 281
 282 buildTries :: Int -> [[Token]] -> Tries Token ()
 283 buildTries n sentences = Tries
 284   { _fwd = buildTrie Forward  n sentences
 285   , _bwd = buildTrie Backward n sentences
 286   }
 287
 288 instance IsTrie Tries where
 289
 290   nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
 291
 292   findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
 293
 294   nodeChild = onTries . nodeChild
 295
 296   entropyTrie = onTries . entropyTrie
 297
 298   evTrie inE setEV = onTries $ evTrie inE setEV
 299
 300   normalizeEntropy inE = onTries . normalizeEntropy inE
 301
 302   printTrie inE (Tries f b) = do
 303     P.putStrLn "Forward:"
 304     printTrie inE f
 305     P.putStrLn ""
 306     P.putStrLn "Backward:"
 307     printTrie inE b
 308
 309 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
 310 onTries h (Tries f b) = Tries (h f) (h b)
 311
 312 ------------------------------------------------------------------------
 313 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 314 split _   _ [] = []
 315 split inE t (Terminal Start:xs) = split inE t xs
 316 split inE t (x0:xs0) = go [x0] xs0
 317   where
 318     mayCons [] xss = xss
 319     mayCons xs xss = xs : xss
 320
 321     go pref []                  = [pref]
 322     go pref (Terminal Stop:_)   = [pref]
 323     go _    (Terminal Start:_)  = panic "split impossible"
 324     go pref (x:xs) =
 325         -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
 326         if acc
 327           then go prefx xs
 328           else mayCons pref $ go [x] xs
 329       where
 330         prefx = pref <> [x]
 331         pt   = findTrie pref t
 332         pxt  = findTrie prefx t
 333         xt   = findTrie [x] t
 334         ept  = ne pt
 335     --  ^ entropy of the current prefix
 336         ext  = ne xt
 337     --  ^ entropy of [x]
 338         epxt = ne pxt
 339     --  ^ entropy of the current prefix plus x
 340         acc  = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > ept + ext)
 341
 342         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 343
 344     ne = nodeEntropy inE
 345
 346 {-
 347 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 348 split inE t0 ts =
 349   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 350 -}
 351
 352 ------------------------------------------------------------------------
 353
 354 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 355 mainEleve n input = map (map printToken) . split info_autonomy (t :: Tries Token (I Double)) <$> inp
 356   where
 357     inp = toToken <$> input
 358     t   = normalizeEntropy info_entropy_var set_autonomy
 359         . evTrie identity set_entropy_var
 360         . entropyTrie isTerminal
 361         $ buildTries n inp
 362
 363
 364 ---------------------------------------------
 365
 366 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
 367
 368 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
 369 testEleve debug n output checks = do
 370   let
 371     res = map (map printToken) . split info_autonomy nt <$> inp
 372   when debug $ do
 373     P.putStrLn $ show input
 374     P.putStrLn ""
 375     printTrie info_entropy nt
 376     P.putStrLn ""
 377     P.putStrLn "Splitting:"
 378     P.putStrLn $ show res
 379   forM_ checks checker
 380   pure $ expected == res
 381
 382   where
 383     out      = T.words <$> output
 384     expected = fmap (T.splitOn "-") <$> out
 385     input    = (T.splitOn "-" =<<) <$> out
 386     inp      = toToken <$> input
 387
 388     nt :: Tries Token (I Double)
 389     nt = normalizeEntropy info_entropy_var set_autonomy
 390        . evTrie identity set_entropy_var
 391        . entropyTrie isTerminal
 392        $ buildTries n inp
 393
 394     check f msg ref my =
 395       if f ref my
 396         then P.putStrLn $ "    \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
 397         else P.putStrLn $ "    \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
 398
 399     checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
 400       let ns  = parseToken <$> T.words ngram
 401           nt' = findTrie ns nt
 402
 403       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 404       check (==) "count"        count        (_node_count                  (_fwd nt'))
 405
 406       check sim  "entropy"      entropy      (nodeEntropy info_entropy           nt' )
 407       check sim  "ev"           ev           (nodeEntropy info_entropy_var       nt' )
 408       check sim  "autonomy"     autonomy     (nodeEntropy info_autonomy          nt' )
 409
 410       check sim  "fwd_entropy"  fwd_entropy  (nodeEntropy info_entropy     (_fwd nt'))
 411       check sim  "fwd_ev"       fwd_ev       (nodeEntropy info_entropy_var (_fwd nt'))
 412       check sim  "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy    (_fwd nt'))
 413
 414       check sim  "bwd_entropy"  bwd_entropy  (nodeEntropy info_entropy     (_bwd nt'))
 415       check sim  "bwd_ev"       bwd_ev       (nodeEntropy info_entropy_var (_bwd nt'))
 416       check sim  "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy    (_bwd nt'))
 417
 418 -- | TODO real data is a list of tokenized sentences
 419 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 420 example0 =  ["New-York is New-York and New-York"]
 421 example1 =  ["to-be or not to-be"]
 422 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 423 example3 =  example0 <> example0
 424        -- > TEST: Should not have York New in the trie
 425 example4 =  ["a-b-c-d e a-b-c-d f"]
 426 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 427 example6 =  ["le-petit chat"
 428             ,"le-petit chien"
 429             ,"le-petit rat"
 430             ,"le gros rat"
 431             ]
 432
 433 checks0, checks2 :: Checks Double
 434
 435 checks0 =
 436 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
 437   [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
 438   , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
 439   , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
 440   , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
 441   , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
 442   , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
 443   , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
 444   , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
 445   , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
 446   , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
 447   , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
 448   , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
 449   , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
 450   , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
 451   , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
 452   , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
 453   , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
 454   , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
 455   , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
 456   , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
 457   , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
 458   ]
 459
 460 checks2 = []
 461 {-
 462   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 463   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 464   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 465   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 466   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 467   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 468   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 469   ]
 470 -}
 471
 472 runTests :: IO ()
 473 runTests =
 474   forM_
 475     [("example0", 3, example0, checks0)
 476     ,("example0", 2, example0, [])
 477     ,("example1", 2, example1, [])
 478     ,("example2", 3, example2, checks2)
 479     ,("example3", 2, example3, [])
 480     ,("example4", 4, example4, [])
 481     ,("example5", 5, example5, [])
 482     ,("example6", 2, example6, [])
 483     ]
 484     (\(name, n, ex, checks) -> do
 485       P.putStrLn $ name <> " " <> show n
 486       b <- testEleve False n ex checks
 487       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 488     )