{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervized Word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

# Implementation of Unsupervized Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese. Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for the current implementation:
- TODO extract longer ngrams (see paper above, a Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  from Gargantext.Text.Terms import extractTermsUnsupervised
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ Gargantext.map _hyperdataDocument_abstract docs
-}

{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}

module Gargantext.Text.Eleve where

-- import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens hiding (levels, children)
import Control.Monad (forM_)
import qualified Data.List as L
import Data.Text (Text)
import qualified Data.Text as T
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

nan :: Floating e => e
nan = 0 / 0

noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN)

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)

subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x | sim src x = dst
                   | otherwise = x

------------------------------------------------------------------------
type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy     :: e
  , _info_entropy_var :: e
  , _info_autonomy    :: e
  }

makeLenses ''I

instance Show e => Show (I e) where
  show (I e ev a) = show (e, ev, a)

type ModEntropy i o e = (e -> e) -> i -> o

set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)

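-- Illustrative example (not from the original module): 'set_autonomy' rewrites
-- only the autonomy field, computing it from the entropy variation.
--
-- >>> set_autonomy (*2) (I 1 2 nan) :: I Double
-- (1.0,2.0,4.0)
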
set_entropy_var :: Entropy e => Setter e (I e) e e
set_entropy_var f e = (\ev -> I e ev nan) <$> f e

data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

nonTerminals :: [Token] -> [Text]
nonTerminals ts = [nt | NonTerminal nt <- ts]

parseToken :: Text -> Token
parseToken "<start>" = Terminal Start
parseToken "<stop>"  = Terminal Stop
parseToken t         = NonTerminal t

toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]

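-- Illustrative example (not from the original module): sentences are wrapped
-- with the two terminals before being inserted into the tries.
--
-- >>> toToken ["new", "york", "is", "new", "york"]
-- [Terminal Start,NonTerminal "new",NonTerminal "york",NonTerminal "is",NonTerminal "new",NonTerminal "york",Terminal Stop]
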
printToken :: Token -> Text
printToken = f
  where
    f (NonTerminal x)  = x
    f (Terminal Start) = "<start>"
    f (Terminal Stop)  = "<stop>"
------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count :: Int }
  deriving (Show)

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

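-- Illustrative example (not from the original module): the root count tracks
-- how many sequences were inserted, while each child counts its own suffixes.
--
-- >>> _node_count (L.foldr insertTrie emptyTrie [[NonTerminal "a"], [NonTerminal "a"], [NonTerminal "b"]])
-- 3
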
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

-----------------------------
-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------
normalizeLevel :: Entropy e => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v

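-- Illustrative example (not from the original module): entropies are z-scored
-- against the mean and deviation of their trie level, so a value one deviation
-- above the level mean normalizes to 1.
--
-- >>> normalizeLevel 1.0 0.5 1.5 :: Double
-- 1.0
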
nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs

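-- Illustrative example (not from the original module): unlike a strict sliding
-- window, the trailing shorter suffixes (and the empty one) are kept.
--
-- >>> chunkAlongEleve 3 [1, 2, 3, 4, 5 :: Int]
-- [[1,2,3],[2,3,4],[3,4,5],[4,5],[5],[]]
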
data Direction = Backward | Forward

buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
buildTrie d n sentences
  = L.foldr insertTrie emptyTrie
  . L.concat
  $ ( filter (/= [Terminal (term d)])
    . chunkAlongEleve (n + 1)
    . order d
    ) <$> sentences
  where
    order Forward  = identity
    order Backward = reverse

    term Forward  = Stop
    term Backward = Start

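-- Illustrative example (not from the original module): for n = 2, each sentence
-- feeds its (n + 1)-sized forward chunks into the trie; the chunk made of the
-- closing terminal alone is filtered out, and the leftover empty chunk only
-- bumps the root count.
--
-- >>> filter (/= [Terminal Stop]) . chunkAlongEleve 3 $ toToken ["a", "b"]
-- [[Terminal Start,NonTerminal "a",NonTerminal "b"],[NonTerminal "a",NonTerminal "b",Terminal Stop],[NonTerminal "b",Terminal Stop],[]]
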
class IsTrie trie where
  entropyTrie      :: Entropy e => (k -> Bool) -> trie k () -> trie k e
  nodeEntropy      :: Entropy e => Getting e i e -> trie k i -> e
  nodeChild        :: Ord k => k -> trie k e -> trie k e
  findTrie         :: Ord k => [k] -> trie k e -> trie k e
  printTrie        :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
  evTrie           :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

instance IsTrie Trie where

  entropyTrie _    (Leaf c)             = Leaf c
  entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
    where
      children' = Map.toList children
      sum_count = sum $ _node_count . snd <$> children'
      e | sum_count == 0 = nan
        | otherwise      = sum $ f <$> children'
      f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                               else - chc * P.logBase 2 chc
        where
          chc = fromIntegral (_node_count child) / fromIntegral c

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  printTrie inE t = do
    P.putStrLn . Tree.drawTree
               . fmap show
               $ toTree (NonTerminal "") t
    P.putStrLn " Levels:"
    forM_ (normalizationLevels inE t) $ \level ->
      P.putStrLn $ " " <> show level

  evTrie inE setEV = go nan
    where
      go _  (Leaf c)            = Leaf c
      go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
        where
          e1 = i ^. inE
      ev 0  0  = nan
      ev i0 i1 = i1 - i0

  normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
    where
      go _ _                 (Leaf c) = Leaf c
      go _ []                _        = panic "normalizeEntropy' empty levels"
      go f ((m, v, _) : ess) (Node c i children)
        = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children

------------------------------------------------------------------------

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure

subForest :: Trie k e -> [Trie k e]
subForest (Leaf _)            = []
subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels

normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
normalizationLevels inE = fmap f . entropyLevels inE
  where
    f es = (mean es, deviation es, length es)

------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

buildTries :: Int -> [[Token]] -> Tries Token ()
buildTries n sentences = Tries
  { _fwd = buildTrie Forward  n sentences
  , _bwd = buildTrie Backward n sentences
  }

instance IsTrie Tries where

  nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]

  findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)

  nodeChild = onTries . nodeChild

  entropyTrie = onTries . entropyTrie

  evTrie inE setEV = onTries $ evTrie inE setEV

  normalizeEntropy inE = onTries . normalizeEntropy inE

  printTrie inE (Tries f b) = do
    P.putStrLn "Forward:"
    printTrie inE f
    P.putStrLn "Backward:"
    printTrie inE b

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries h (Tries f b) = Tries (h f) (h b)

------------------------------------------------------------------------
mayCons :: [a] -> [[a]] -> [[a]]
mayCons [] xss = xss
mayCons xs xss = xs : xss

-- An earlier, entropy-only variant of 'split', kept for reference:
{-
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _ []                  = []
split inE t (Terminal Start:xs) = split inE t xs
split inE t (x0:xs0)            = go [x0] xs0
  where
    go pref (Terminal Stop:_)  = [pref]
    go _    (Terminal Start:_) = panic "split impossible"
    go pref (x:xs) =
      -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
      if acc then go prefx xs
             else mayCons pref $ go [x] xs
      where
        prefx = pref <> [x]
        pxt   = findTrie prefx t
        ept   = nodeEntropy inE (findTrie pref t)
        -- ^ entropy of the current prefix
        ext   = nodeEntropy inE (findTrie [x] t)
        epxt  = nodeEntropy inE pxt
        -- ^ entropy of the current prefix plus x
        acc   = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
-}
-- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
split _ _   _ []  = []
split _ _   _ [t] = pure <$> nonTerminals [t]
split n inE t ts  = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
  where
    pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
                       (L.tail . L.inits . take n $ ts)

-- A sketch of an exhaustive variant (pseudocode):
{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = split n info_autonomy (t :: Tries Token (I Double)) <$> inp
  where
    inp = toToken <$> input
    t   = normalizeEntropy info_entropy_var set_autonomy
        . evTrie identity set_entropy_var
        . entropyTrie isTerminal
        $ buildTries n inp

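-- Illustrative usage sketch ('exampleEleve' is a hypothetical helper, not part
-- of the original module): 'mainEleve' expects pre-tokenized sentences and
-- returns each sentence segmented into multi-word terms; the sentences below
-- are made up.
exampleEleve :: [[[Text]]]
exampleEleve = mainEleve 3 (T.words <$> ["the new york times is a newspaper", "new york is a city"])
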
---------------------------------------------
type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]

testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
testEleve debug n output checks = do
  let
    res = split n info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn $ show input
    printTrie info_entropy nt
    P.putStrLn "Splitting:"
    P.putStrLn $ show res
  forM_ checks checker
  pure $ expected == res

  where
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken <$> input

    nt :: Tries Token (I Double)
    nt = normalizeEntropy info_entropy_var set_autonomy
       . evTrie identity set_entropy_var
       . entropyTrie isTerminal
       $ buildTries n inp

    check f msg ref my =
      if f ref my
        then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
        else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my

    checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
      let ns  = parseToken <$> T.words ngram
          nt' = findTrie ns nt

      P.putStrLn $ " " <> T.unpack ngram <> ":"
      check (==) "count"        count        (_node_count (_fwd nt'))

      check sim "entropy"       entropy      (nodeEntropy info_entropy     nt')
      check sim "ev"            ev           (nodeEntropy info_entropy_var nt')
      check sim "autonomy"      autonomy     (nodeEntropy info_autonomy    nt')

      check sim "fwd_entropy"   fwd_entropy  (nodeEntropy info_entropy     (_fwd nt'))
      check sim "fwd_ev"        fwd_ev       (nodeEntropy info_entropy_var (_fwd nt'))
      check sim "fwd_autonomy"  fwd_autonomy (nodeEntropy info_autonomy    (_fwd nt'))

      check sim "bwd_entropy"   bwd_entropy  (nodeEntropy info_entropy     (_bwd nt'))
      check sim "bwd_ev"        bwd_ev       (nodeEntropy info_entropy_var (_bwd nt'))
      check sim "bwd_autonomy"  bwd_autonomy (nodeEntropy info_autonomy    (_bwd nt'))

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- ^ TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"]

checks0, checks2 :: Checks Double

checks0 =
-- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
  [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
  , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
  , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
  , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
  , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
  , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
  , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
  , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
  , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
  , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
  , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
  , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
  , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
  , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
  , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
  , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
  , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
  ]

-- The tuples below do not match the current 11-field 'Checks' format, so they
-- are kept commented out.
checks2 = []
{-
  [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
  ,("be or", 2, 0.5, nan, nan, nan, 1.0)
  ,("or not", 1, 0.0, nan, nan, nan, 0.0)
  ,("not to", 1, 0.0, nan, nan, nan, 0.0)
  ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
  ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
  ,("be and", 1, 0.0, nan, nan, nan, 0.0)
  ]
-}

-- | Runs 'testEleve' over the examples above (runner name is assumed here).
runTestsEleve :: IO ()
runTestsEleve =
  forM_
    [("example0", 3, example0, checks0)
    ,("example0", 2, example0, [])
    ,("example1", 2, example1, [])
    ,("example2", 3, example2, checks2)
    ,("example3", 2, example3, [])
    ,("example4", 4, example4, [])
    ,("example5", 5, example5, [])
    ,("example6", 2, example6, [])
    ]
    (\(name, n, ex, checks) -> do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )
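
-- Illustrative note (not from the original file): a single example can also be
-- checked from GHCi, e.g. `testEleve False 3 example0 checks0` prints a
-- PASS/FAIL line per checked field of checks0 and returns whether the computed
-- splitting matches the expected segmentation.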