src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO fix normalization
  24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  26
  27 - AD: Real ngrams extraction test
  28   from Gargantext.Text.Terms import extractTermsUnsupervised
  29   docs <- runCmdRepl $ selectDocs 1004
  30   extractTermsUnsupervised 3 $ DT.intercalate " "
  31                         $ catMaybes
  32                         $ Gargantext.map _hyperdataDocument_abstract docs
  33
  34 -}
  35 {-# LANGUAGE ConstraintKinds   #-}
  36 {-# LANGUAGE NoImplicitPrelude #-}
  37 {-# LANGUAGE OverloadedStrings #-}
  38 {-# LANGUAGE RankNTypes        #-}
  39 {-# LANGUAGE TemplateHaskell   #-}
  40 {-# LANGUAGE TypeFamilies      #-}
  41
  42 module Gargantext.Text.Eleve where
  43
  44 import Debug.Trace (trace)
  45 -- import Debug.SimpleReflect
  46
  47 import Data.Functor.Reverse
  48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed)
  49 import Control.Monad (forM_)
  50 import Data.Ord (Ord)
  51 import qualified Data.List as L
  52 import Data.Monoid
  53 import Data.Text (Text)
  54 import qualified Data.Text as T
  55 import Data.Map (Map)
  56 import Data.Maybe (fromMaybe)
  57 import qualified Data.Map as Map
  58 import Gargantext.Prelude hiding (cs)
  59 import qualified Data.Tree as Tree
  60 import Data.Tree (Tree)
  61 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  62
  63 type Entropy e =
  64   ( Fractional e
  65   , Floating e
  66   , P.RealFloat e
  67   , Show e
  68   -- ^ TODO: only used for debugging
  69   )
  70 ------------------------------------------------------------------------
  71 -- | Example and tests for development
  72 data I e = I
  73   { _info_entropy  :: e
  74   , _info_autonomy :: e
  75   }
  76
  77 instance Show e => Show (I e) where
  78   show (I e n) = show (e, n)
  79
  80 makeLenses ''I
  81
  82 type ModEntropy i o e = (e -> e) -> i -> o
  83
  84 set_autonomy :: ModEntropy e (I e) e
  85 set_autonomy f e = I e (f e)
  86
  87 data StartStop = Start | Stop
  88   deriving (Ord, Eq, Show)
  89
  90 data Token = NonTerminal Text
  91            | Terminal StartStop
  92   deriving (Ord, Eq, Show)
  93
  94 isTerminal :: Token -> Bool
  95 isTerminal (Terminal    _) = True
  96 isTerminal (NonTerminal _) = False
  97
  98 parseToken :: Text -> Token
  99 parseToken "<start>" = Terminal Start
 100 parseToken "<stop>"  = Terminal Stop
 101 parseToken t         = NonTerminal t
 102
 103
 104 toToken :: [Text] -> [Token]
 105 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 106
 107 printToken :: Token -> Text
 108 printToken = f
 109   where
 110     f (NonTerminal x)  = x
 111     f (Terminal Start) = "<start>"
 112     f (Terminal Stop)  = "<stop>"
 113
 114 ------------------------------------------------------------------------
 115
 116 data Trie k e
 117   = Node { _node_count    :: Int
 118          , _node_entropy  :: e
 119          , _node_children :: Map k (Trie k e)
 120          }
 121  | Leaf { _node_count    :: Int }
 122   deriving (Show)
 123
 124 makeLenses ''Trie
 125
 126 insertTries :: Ord k => [[k]] -> Trie k ()
 127 insertTries = L.foldr insertTrie emptyTrie
 128
 129 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 130 insertTrie []     n                    = n { _node_count = _node_count n +1}
 131 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 132 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 133   where
 134     f = Just . insertTrie xs . fromMaybe emptyTrie
 135
 136 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 137 -- emptyTrie = Node 0 mempty mempty
 138 emptyTrie :: Trie k e
 139 emptyTrie  = Leaf 0
 140
 141 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 142 mkTrie c children
 143   | Map.null children = Leaf c
 144   | otherwise         = Node c mempty children
 145
 146                         -----------------------------
 147
 148 -- | Trie to Tree since Tree as nice print function
 149 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 150 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 151 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 152
 153 ------------------------------------------------------------------------
 154 ------------------------------------------------------------------------
 155
 156 nan :: Floating e => e
 157 nan = 0 / 0
 158
 159 noNaNs :: P.RealFloat e => [e] -> [e]
 160 noNaNs = filter (not . P.isNaN)
 161
 162 updateIfDefined :: P.RealFloat e => e -> e -> e
 163 updateIfDefined e0 e | P.isNaN e = e0
 164                      | otherwise = e
 165
 166 subst :: Entropy e => (e, e) -> e -> e
 167 subst (src, dst) x | sim src x = dst
 168                    | otherwise = x
 169
 170 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
 171 entropyTrie _    (Leaf c)             = Leaf c
 172 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 173   where
 174     e = sum $ map f $ Map.toList children
 175     f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 176                              else - chc * P.logBase 2 chc
 177       where
 178         chc = fromIntegral (_node_count child) / fromIntegral c
 179 ------------------------------------------------------------------------
 180
 181 normalizeLevel :: Entropy e => [e] -> e -> e
 182 normalizeLevel = checkDiff (go . noNaNs)
 183
 184   where
 185     -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
 186     checkDiff = identity
 187 --  go []  = panic "normalizeLevel: impossible"
 188                         -- trace "normalizeLevel"
 189 --    go [_] = identity
 190     go es  = \e -> (e - m) / v
 191 {-
 192                               in if P.isNaN e'
 193                                   then trace ("normalizeLevel " <> show (e,m,v,es))
 194                                       e
 195                                   else e'
 196 -}
 197       where
 198         m  = mean      es
 199         v  = deviation es
 200
 201 {- Unused
 202
 203 nodeChildren :: Trie k e -> Map k (Trie k e)
 204 nodeChildren (Node _ _ cs) = cs
 205 nodeChildren (Leaf _)      = Map.empty
 206
 207 -}
 208
 209
 210 class IsTrie trie where
 211   buildTrie   :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
 212   nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
 213   nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
 214   findTrie    :: Ord k => [k] -> trie k e -> trie k e
 215   normalizeEntropy :: Entropy e
 216                    => Getting e i e -> ModEntropy i o e
 217                    -> trie k i -> trie k o
 218
 219 -- UNUSED
 220 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
 221 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
 222
 223 instance IsTrie Trie where
 224   buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
 225
 226   nodeEntropy inE (Node _ e _) = e ^. inE
 227   nodeEntropy _   (Leaf _)     = nan
 228
 229   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 230   nodeChild _ (Leaf _)      = emptyTrie
 231
 232   findTrie ks t = L.foldl (flip nodeChild) t ks
 233
 234   normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
 235     where
 236       go _ []         _                   = panic "normalizeEntropy' empty levels"
 237       go _ _          (Leaf c)            = Leaf c
 238 --      go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
 239       go f (es : ess) (Node c i children)
 240     --  | any (sim (i ^. inE)) es
 241         = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
 242     --  | otherwise
 243     --  = panic "NOT an elem"
 244
 245
 246   {-
 247   This is only normalizing a node with respect to its brothers (unlike all the
 248   nodes of the same level).
 249
 250   normalizeEntropy inE modE = go $ modE identity
 251     where
 252       go _ (Leaf c) = Leaf c
 253       go f (Node c i children)
 254         | Map.null children =
 255             panic "normalizeEntropy: impossible"
 256         | otherwise         =
 257             Node c (f i) $ go (modE $ normalizeLevel es) <$> children
 258           where
 259             es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
 260   -}
 261 ------------------------------------------------------------------------
 262
 263 levels :: Trie k e -> [[Trie k e]]
 264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 265   where
 266     subForest :: Trie k e -> [Trie k e]
 267     subForest (Leaf _)            = []
 268     subForest (Node _ _ children) = Map.elems children
 269
 270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
 272
 273 ------------------------------------------------------------------------
 274
 275 data Tries k e = Tries
 276   { _fwd :: Trie k e
 277   , _bwd :: Trie k e
 278   }
 279
 280 instance IsTrie Tries where
 281   buildTrie to n tts = Tries { _fwd = buildTrie to n tts
 282                              , _bwd = buildTrie to n (map reverse $ tts)
 283                              }
 284
 285   nodeEntropy inE (Tries fwd bwd) =
 286     -- VETODO reverse the query for bwd here
 287     -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
 288     mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
 289
 290   findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
 291   --                                                              ^^
 292   -- TODO: here this is tempting to reverse but this is not always what we
 293   -- want. See also nodeAutonomy.
 294   -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
 295   -- since recursivity of the function makes the reverse multiple times (I guess)
 296
 297   nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
 298
 299   normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
 300
 301 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
 302 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
 303
 304 ------------------------------------------------------------------------
 305 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 306 split _   _  [] = []
 307 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
 308 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
 309   where
 310     consRev [] xss = xss
 311     consRev xs xss = reverse xs : xss
 312
 313     go _ pref []                  = [reverse pref]
 314     go _ pref (Terminal Stop:_)   = [reverse pref]
 315     go t pref (Terminal Start:xs) = go t pref xs
 316     go t pref (x:xs) =
 317         -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
 318         if acc
 319           then go xt (x:pref) xs
 320           else consRev pref $ go xt0 [x] xs
 321       where
 322         xt   = nodeChild x t
 323         xt0  = nodeChild x t0
 324         et   = ne 0 t
 325     --  ^ entropy of the current prefix
 326         ext0 = ne 0 xt0
 327     --  ^ entropy of [x]
 328         ext  = ne 0 xt
 329     --  ^ entropy of the current prefix plus x
 330         acc  = ext > et + ext0
 331         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 332
 333     ne d t = if P.isNaN e then d else e
 334       where e = nodeEntropy inE t
 335
 336 {-
 337 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 338 split inE t0 ts =
 339   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 340 -}
 341
 342 ------------------------------------------------------------------------
 343 ------------------------------------------------------------------------
 344
 345 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 346 mainEleve _ _ = []
 347 {-
 348 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
 349   where
 350     inp = toToken <$> input
 351     t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
 352 -}
 353
 354 sim :: Entropy e => e -> e -> Bool
 355 sim x y = x == y || (P.isNaN x && P.isNaN y)
 356
 357 chunkAlongEleve :: Int -> [a] -> [[a]]
 358 chunkAlongEleve n xs = L.take n <$> L.tails xs
 359
 360 toToken' :: Int -> [[Text]] -> [[Token]]
 361 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
 362
 363 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
 364 testEleve debug n output checks = do
 365   let
 366     {-
 367     pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
 368           | ps <- L.nub $ [ c
 369                           | m <- [1..n]
 370                           , cs <- chunkAlong m 1 <$> inp
 371                           , c <- cs
 372                           ]
 373           ]
 374     -}
 375   --res = map (map printToken) . split identity fwd <$> inp
 376   --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
 377     res = map (map printToken) . split info_autonomy nt <$> inp
 378   when debug $ do
 379     P.putStrLn (show input)
 380     -- forM_ pss (P.putStrLn . show)
 381     P.putStrLn ""
 382     P.putStrLn "Levels:"
 383     forM_ (entropyLevels identity (_fwd t)) $ \level ->
 384       P.putStrLn $ "  " <> show level
 385     P.putStrLn ""
 386     P.putStrLn "Forward:"
 387     printTrie (_fwd nt)
 388     P.putStrLn ""
 389     P.putStrLn "Backward:"
 390     printTrie (_bwd nt)
 391     P.putStrLn ""
 392     P.putStrLn "Splitting:"
 393     P.putStrLn $ show res
 394   forM_ checks checker
 395   pure $ expected == res
 396
 397   where
 398     out      = T.words <$> output
 399     expected = fmap (T.splitOn "-") <$> out
 400     input    = (T.splitOn "-" =<<) <$> out
 401     inp      = toToken <$> input
 402     t        = buildTrie toToken' n input
 403     -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
 404     -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
 405     nt = normalizeEntropy identity set_autonomy t
 406
 407     check f msg ref my =
 408       if f ref my
 409         then P.putStrLn $ "    PASS " <> msg <> " " <> show ref
 410         else P.putStrLn $ "    FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
 411
 412     checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
 413       let ns  = parseToken <$> T.words ngram
 414           nsb = parseToken <$> (reverse $ T.words ngram)
 415
 416           t'  = findTrie ns  nt
 417           tb' = findTrie nsb nt
 418           -- TODO put this Variation Entropy at VETODO mark above maybe in nodeEntropy ?
 419           ev = (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
 420
 421       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 422       check (==) "count"       count       (_node_count (_fwd t'))
 423       check sim  "entropy"     entropy     ev
 424       check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
 425       check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
 426       check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
 427
 428     printTrie =
 429       P.putStrLn . Tree.drawTree
 430                  . fmap show
 431                  . toTree (NonTerminal "")
 432
 433 -- | TODO real data is a list of tokenized sentences
 434 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 435 example0 =  ["New-York is New-York and New-York"]
 436 example1 =  ["to-be or not to-be"]
 437 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 438 example3 =  example0 <> example0
 439        -- > TEST: Should not have York New in the trie
 440 example4 =  ["a-b-c-d e a-b-c-d f"]
 441 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 442 example6 =  ["le-petit chat"
 443             ,"le-petit chien"
 444             ,"le-petit rat"
 445             ,"le gros rat"
 446             ]
 447
 448 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
 449
 450 checks0 =
 451   [("<start>", 1, nan, nan, nan, nan, 0.0)
 452   ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
 453   ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
 454   ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 455   ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 456   ,("<stop>", 0, nan, nan, nan, 0.0, nan)
 457
 458 --{-
 459   ,("<start> New", 1, nan, nan, nan, nan, 0.0)
 460   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
 461   ,("York is", 1, 0.0, nan, nan, nan, 0.0)
 462   ,("is New", 1, 0.0, nan, nan, nan, 0.0)
 463   ,("York and", 1, 0.0, nan, nan, nan, 0.0)
 464   ,("and New", 1, 0.0, nan, nan, nan, 0.0)
 465   ,("York <stop>", 1, nan, nan, nan, nan, nan)
 466
 467   ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
 468   ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
 469   ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
 470   ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
 471   ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
 472   ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
 473   ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
 474   ,("New York <stop>", 1, nan, nan, nan, nan, nan)
 475 --}
 476   ]
 477
 478
 479
 480 checks2 =
 481   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 482   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 483   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 484   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 485   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 486   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 487   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 488   ]
 489
 490
 491 runTests :: IO ()
 492 runTests =
 493   forM_
 494     [("example0", 2, example0, checks0)
 495     ,("example1", 2, example1, [])
 496     ,("example2", 3, example2, checks2)
 497     ,("example3", 2, example3, [])
 498     ,("example4", 4, example4, [])
 499     ,("example5", 5, example5, [])
 500     ]
 501     (\(name, n, ex, checks) -> do
 502       P.putStrLn $ name <> " " <> show n
 503       b <- testEleve False n ex checks
 504       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 505     )