src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO fix normalization
  24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  26
  27 - AD: Real ngrams extraction test
  28   from Gargantext.Text.Terms import extractTermsUnsupervised
  29   docs <- runCmdRepl $ selectDocs 1004
  30   extractTermsUnsupervised 3 $ DT.intercalate " "
  31                         $ catMaybes
  32                         $ Gargantext.map _hyperdataDocument_abstract docs
  33
  34 -}
  35 {-# LANGUAGE ConstraintKinds   #-}
  36 {-# LANGUAGE NoImplicitPrelude #-}
  37 {-# LANGUAGE OverloadedStrings #-}
  38 {-# LANGUAGE RankNTypes        #-}
  39 {-# LANGUAGE TemplateHaskell   #-}
  40 {-# LANGUAGE TypeFamilies      #-}
  41
  42 module Gargantext.Text.Eleve where
  43
  44 import Debug.Trace (trace)
  45 -- import Debug.SimpleReflect
  46
  47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed)
  48 import Control.Monad (forM_)
  49 import Data.Ord (Ord)
  50 import qualified Data.List as L
  51 import Data.Monoid
  52 import Data.Text (Text)
  53 import qualified Data.Text as T
  54 import Data.Map (Map)
  55 import Data.Maybe (fromMaybe)
  56 import qualified Data.Map as Map
  57 import Gargantext.Prelude hiding (cs)
  58 import qualified Data.Tree as Tree
  59 import Data.Tree (Tree)
  60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  61
  62 type Entropy e =
  63   ( Fractional e
  64   , Floating e
  65   , P.RealFloat e
  66   , Show e
  67   -- ^ TODO: only used for debugging
  68   )
  69 ------------------------------------------------------------------------
  70 -- | Example and tests for development
  71 data I e = I
  72   { _info_entropy  :: e
  73   , _info_autonomy :: e
  74   }
  75
  76 instance Show e => Show (I e) where
  77   show (I e n) = show (e, n)
  78
  79 makeLenses ''I
  80
  81 type ModEntropy i o e = (e -> e) -> i -> o
  82
  83 set_autonomy :: ModEntropy e (I e) e
  84 set_autonomy f e = I e (f e)
  85
  86 data StartStop = Start | Stop
  87   deriving (Ord, Eq, Show)
  88
  89 data Token = NonTerminal Text
  90            | Terminal StartStop
  91   deriving (Ord, Eq, Show)
  92
  93 isTerminal :: Token -> Bool
  94 isTerminal (Terminal    _) = True
  95 isTerminal (NonTerminal _) = False
  96
  97 parseToken :: Text -> Token
  98 parseToken "<start>" = Terminal Start
  99 parseToken "<stop>"  = Terminal Stop
 100 parseToken t         = NonTerminal t
 101
 102
 103 toToken :: [Text] -> [Token]
 104 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 105
 106 printToken :: Token -> Text
 107 printToken = f
 108   where
 109     f (NonTerminal x)  = x
 110     f (Terminal Start) = "<start>"
 111     f (Terminal Stop)  = "<stop>"
 112
 113 ------------------------------------------------------------------------
 114
 115 data Trie k e
 116   = Node { _node_count    :: Int
 117          , _node_entropy  :: e
 118          , _node_children :: Map k (Trie k e)
 119          }
 120  | Leaf { _node_count    :: Int }
 121   deriving (Show)
 122
 123 makeLenses ''Trie
 124
 125 insertTries :: Ord k => [[k]] -> Trie k ()
 126 insertTries = L.foldr insertTrie emptyTrie
 127
 128 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 129 insertTrie []     n                    = n { _node_count = _node_count n +1}
 130 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 131 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 132   where
 133     f = Just . insertTrie xs . fromMaybe emptyTrie
 134
 135 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 136 -- emptyTrie = Node 0 mempty mempty
 137 emptyTrie :: Trie k e
 138 emptyTrie  = Leaf 0
 139
 140 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 141 mkTrie c children
 142   | Map.null children = Leaf c
 143   | otherwise         = Node c mempty children
 144
 145                         -----------------------------
 146
 147 -- | Trie to Tree since Tree as nice print function
 148 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 149 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 150 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 151
 152 ------------------------------------------------------------------------
 153 ------------------------------------------------------------------------
 154
 155 nan :: Floating e => e
 156 nan = 0 / 0
 157
 158 noNaNs :: P.RealFloat e => [e] -> [e]
 159 noNaNs = filter (not . P.isNaN)
 160
 161 updateIfDefined :: P.RealFloat e => e -> e -> e
 162 updateIfDefined e0 e | P.isNaN e = e0
 163                      | otherwise = e
 164
 165 subst :: Entropy e => (e, e) -> e -> e
 166 subst (src, dst) x | sim src x = dst
 167                    | otherwise = x
 168
 169 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
 170 entropyTrie _    (Leaf c)             = Leaf c
 171 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 172   where
 173     e = sum $ map f $ Map.toList children
 174     f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 175                              else - chc * P.logBase 2 chc
 176       where
 177         chc = fromIntegral (_node_count child) / fromIntegral c
 178 ------------------------------------------------------------------------
 179
 180 normalizeLevel :: Entropy e => [e] -> e -> e
 181 normalizeLevel = checkDiff (go . noNaNs)
 182
 183   where
 184     -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
 185     checkDiff = identity
 186 --  go []  = panic "normalizeLevel: impossible"
 187                         -- trace "normalizeLevel"
 188 --    go [_] = identity
 189     go es  = \e -> (e - m) / v
 190 {-
 191                               in if P.isNaN e'
 192                                   then trace ("normalizeLevel " <> show (e,m,v,es))
 193                                       e
 194                                   else e'
 195 -}
 196       where
 197         m  = mean      es
 198         v  = deviation es
 199
 200 {- Unused
 201
 202 nodeChildren :: Trie k e -> Map k (Trie k e)
 203 nodeChildren (Node _ _ cs) = cs
 204 nodeChildren (Leaf _)      = Map.empty
 205
 206 -}
 207
 208
 209 class IsTrie trie where
 210   buildTrie   :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
 211   nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
 212   nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
 213   findTrie    :: Ord k => [k] -> trie k e -> trie k e
 214   normalizeEntropy :: Entropy e
 215                    => Getting e i e -> ModEntropy i o e
 216                    -> trie k i -> trie k o
 217
 218 -- UNUSED
 219 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
 220 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
 221
 222 instance IsTrie Trie where
 223   buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
 224
 225   nodeEntropy inE (Node _ e _) = e ^. inE
 226   nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
 227                                  nan
 228
 229   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 230   nodeChild _ (Leaf _)      = emptyTrie
 231
 232   findTrie ks t = L.foldl (flip nodeChild) t ks
 233
 234   normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
 235     where
 236       go _ []         _                   = panic "normalizeEntropy' empty levels"
 237       go _ _          (Leaf c)            = Leaf c
 238 --      go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
 239       go f (es : ess) (Node c i children)
 240     --  | any (sim (i ^. inE)) es
 241         = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
 242     --  | otherwise
 243     --  = panic "NOT an elem"
 244
 245
 246   {-
 247   This is only normalizing a node with respect to its brothers (unlike all the
 248   nodes of the same level).
 249
 250   normalizeEntropy inE modE = go $ modE identity
 251     where
 252       go _ (Leaf c) = Leaf c
 253       go f (Node c i children)
 254         | Map.null children =
 255             panic "normalizeEntropy: impossible"
 256         | otherwise         =
 257             Node c (f i) $ go (modE $ normalizeLevel es) <$> children
 258           where
 259             es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
 260   -}
 261 ------------------------------------------------------------------------
 262
 263 levels :: Trie k e -> [[Trie k e]]
 264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 265   where
 266     subForest :: Trie k e -> [Trie k e]
 267     subForest (Leaf _)            = []
 268     subForest (Node _ _ children) = Map.elems children
 269
 270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
 272
 273 ------------------------------------------------------------------------
 274
 275 data Tries k e = Tries
 276   { _fwd :: Trie k e
 277   , _bwd :: Trie k e
 278   }
 279
 280 instance IsTrie Tries where
 281   buildTrie to n tts = Tries { _fwd = buildTrie to n tts
 282                              , _bwd = buildTrie to n (map reverse $ tts)
 283                              }
 284
 285   nodeEntropy inE (Tries fwd bwd) =
 286     -- VETODO reverse the query for bwd here
 287     -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
 288     mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
 289
 290   findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
 291   --                                                              ^^
 292   -- TODO: here this is tempting to reverse but this is not always what we
 293   -- want. See also nodeAutonomy.
 294
 295   nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
 296
 297   normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
 298
 299 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
 300 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
 301
 302 ------------------------------------------------------------------------
 303 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 304 split _   _  [] = []
 305 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
 306 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
 307   where
 308     consRev [] xss = xss
 309     consRev xs xss = reverse xs : xss
 310
 311     go _ pref []                  = [reverse pref]
 312     go _ pref (Terminal Stop:_)   = [reverse pref]
 313     go t pref (Terminal Start:xs) = go t pref xs
 314     go t pref (x:xs) =
 315         -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
 316         if acc
 317           then go xt (x:pref) xs
 318           else consRev pref $ go xt0 [x] xs
 319       where
 320         xt   = nodeChild x t
 321         xt0  = nodeChild x t0
 322         et   = ne 0 t
 323     --  ^ entropy of the current prefix
 324         ext0 = ne 0 xt0
 325     --  ^ entropy of [x]
 326         ext  = ne 0 xt
 327     --  ^ entropy of the current prefix plus x
 328         acc  = ext > et + ext0
 329         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 330
 331     ne d t = if P.isNaN e then d else e
 332       where e = nodeEntropy inE t
 333
 334 {-
 335 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 336 split inE t0 ts =
 337   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 338 -}
 339
 340 ------------------------------------------------------------------------
 341 ------------------------------------------------------------------------
 342
 343 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 344 mainEleve _ _ = []
 345 {-
 346 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
 347   where
 348     inp = toToken <$> input
 349     t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
 350 -}
 351
 352 sim :: Entropy e => e -> e -> Bool
 353 sim x y = x == y || (P.isNaN x && P.isNaN y)
 354
 355 chunkAlongEleve :: Int -> [a] -> [[a]]
 356 chunkAlongEleve n xs = L.take n <$> L.tails xs
 357
 358 toToken' :: Int -> [[Text]] -> [[Token]]
 359 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
 360
 361 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
 362 testEleve debug n output checks = do
 363   let
 364     {-
 365     pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
 366           | ps <- L.nub $ [ c
 367                           | m <- [1..n]
 368                           , cs <- chunkAlong m 1 <$> inp
 369                           , c <- cs
 370                           ]
 371           ]
 372     -}
 373   --res = map (map printToken) . split identity fwd <$> inp
 374   --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
 375     res = map (map printToken) . split info_autonomy nt <$> inp
 376   when debug $ do
 377     P.putStrLn (show input)
 378     -- forM_ pss (P.putStrLn . show)
 379     P.putStrLn ""
 380     P.putStrLn "Levels:"
 381     forM_ (entropyLevels identity (_fwd t)) $ \level ->
 382       P.putStrLn $ "  " <> show level
 383     P.putStrLn ""
 384     P.putStrLn "Forward:"
 385     printTrie (_fwd nt)
 386     P.putStrLn ""
 387     P.putStrLn "Backward:"
 388     printTrie (_bwd nt)
 389     P.putStrLn ""
 390     P.putStrLn "Splitting:"
 391     P.putStrLn $ show res
 392   forM_ checks checker
 393   pure $ expected == res
 394
 395   where
 396     out = T.words <$> output
 397     expected = fmap (T.splitOn "-") <$> out
 398     input = (T.splitOn "-" =<<) <$> out
 399     inp = toToken <$> input
 400     t = buildTrie toToken' n input
 401     -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
 402     -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
 403     nt = normalizeEntropy identity set_autonomy t
 404
 405     check f msg ref my =
 406       if f ref my
 407         then P.putStrLn $ "    PASS " <> msg <> " " <> show ref
 408         else P.putStrLn $ "    FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
 409
 410     checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
 411       let ns  = parseToken <$> T.words ngram
 412           t' = findTrie ns nt
 413           nsb = parseToken <$> (reverse $ T.words ngram)
 414           tb' = findTrie nsb nt
 415           -- TODO put this Variation Entropy at VETODO mark above
 416           ev = (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
 417
 418       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 419       check (==) "count"       count       (_node_count (_fwd t'))
 420       check sim  "entropy"     entropy     ev
 421       check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
 422       check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
 423       check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
 424
 425     printTrie =
 426       P.putStrLn . Tree.drawTree
 427                  . fmap show
 428                  . toTree (NonTerminal "")
 429
 430 -- | TODO real data is a list of tokenized sentences
 431 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 432 example0 =  ["New-York is New-York and New-York"]
 433 example1 =  ["to-be or not to-be"]
 434 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 435 example3 =  example0 <> example0
 436        -- > TEST: Should not have York New in the trie
 437 example4 =  ["a-b-c-d e a-b-c-d f"]
 438 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 439 example6 =  ["le-petit chat"
 440             ,"le-petit chien"
 441             ,"le-petit rat"
 442             ,"le gros rat"
 443             ]
 444
 445 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
 446
 447 checks0 =
 448   [("<start>", 1, nan, nan, nan, nan, 0.0)
 449   ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
 450   ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
 451   ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 452   ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 453   ,("<stop>", 0, nan, nan, nan, 0.0, nan)
 454
 455 {-
 456   ,("<start> New", 1, nan, nan, nan, nan, 0.0)
 457   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
 458   ,("York is", 1, 0.0, nan, nan, nan, 0.0)
 459   ,("is New", 1, 0.0, nan, nan, nan, 0.0)
 460   ,("York and", 1, 0.0, nan, nan, nan, 0.0)
 461   ,("and New", 1, 0.0, nan, nan, nan, 0.0)
 462   ,("York <stop>", 1, nan, nan, nan, nan, nan)
 463
 464   ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
 465   ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
 466   ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
 467   ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
 468   ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
 469   ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
 470   ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
 471   ,("New York <stop>", 1, nan, nan, nan, nan, nan)
 472 --}
 473   ]
 474
 475
 476
 477 checks2 =
 478   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 479   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 480   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 481   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 482   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 483   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 484   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 485   ]
 486
 487
 488 runTests :: IO ()
 489 runTests =
 490   forM_
 491     [("example0", 2, example0, checks0)
 492     ,("example1", 2, example1, [])
 493     ,("example2", 3, example2, checks2)
 494     ,("example3", 2, example3, [])
 495     ,("example4", 4, example4, [])
 496     ,("example5", 5, example5, [])
 497     ]
 498     (\(name, n, ex, checks) -> do
 499       P.putStrLn $ name <> " " <> show n
 500       b <- testEleve False n ex checks
 501       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 502     )