src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO fix normalization
  24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  26
  27 - AD: Real ngrams extraction test
  28   from Gargantext.Text.Terms import extractTermsUnsupervised
  29   docs <- runCmdRepl $ selectDocs 1004
  30   extractTermsUnsupervised 3 $ DT.intercalate " "
  31                         $ catMaybes
  32                         $ Gargantext.map _hyperdataDocument_abstract docs
  33
  34 -}
  35 {-# LANGUAGE ConstraintKinds   #-}
  36 {-# LANGUAGE NoImplicitPrelude #-}
  37 {-# LANGUAGE OverloadedStrings #-}
  38 {-# LANGUAGE RankNTypes        #-}
  39 {-# LANGUAGE TemplateHaskell   #-}
  40 {-# LANGUAGE TypeFamilies      #-}
  41
  42 module Gargantext.Text.Eleve where
  43
  44 import Debug.Trace (trace)
  45 -- import Debug.SimpleReflect
  46
  47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
  48 import Control.Monad (forM_)
  49 import Data.Ord (Ord)
  50 import qualified Data.List as L
  51 import Data.Monoid
  52 import Data.Text (Text)
  53 import qualified Data.Text as T
  54 import Data.Map (Map)
  55 import Data.Maybe (fromMaybe)
  56 import qualified Data.Map as Map
  57 import Gargantext.Prelude hiding (cs)
  58 import qualified Data.Tree as Tree
  59 import Data.Tree (Tree)
  60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  61
  62 type Entropy e =
  63   ( Fractional e
  64   , Floating e
  65   , P.RealFloat e
  66   , Show e
  67   -- ^ TODO: only used for debugging
  68   )
  69 ------------------------------------------------------------------------
  70 -- | Example and tests for development
  71 data I e = I
  72   { _info_entropy  :: e
  73   , _info_autonomy :: e
  74   }
  75
  76 instance Show e => Show (I e) where
  77   show (I e n) = show (e, n)
  78
  79 makeLenses ''I
  80
  81 type ModEntropy i o e = (e -> e) -> i -> o
  82
  83 set_autonomy :: ModEntropy e (I e) e
  84 set_autonomy f e = I e (f e)
  85
  86 data StartStop = Start | Stop
  87   deriving (Ord, Eq, Show)
  88
  89 data Token = NonTerminal Text
  90            | Terminal StartStop
  91   deriving (Ord, Eq, Show)
  92
  93 isTerminal :: Token -> Bool
  94 isTerminal (Terminal    _) = True
  95 isTerminal (NonTerminal _) = False
  96
  97 parseToken :: Text -> Token
  98 parseToken "<start>" = Terminal Start
  99 parseToken "<stop>"  = Terminal Stop
 100 parseToken t         = NonTerminal t
 101
 102
 103 toToken :: [Text] -> [Token]
 104 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 105
 106 printToken :: Token -> Text
 107 printToken = f
 108   where
 109     f (NonTerminal x)  = x
 110     f (Terminal Start) = "<start>"
 111     f (Terminal Stop)  = "<stop>"
 112
 113 ------------------------------------------------------------------------
 114
 115 data Trie k e
 116   = Node { _node_count    :: Int
 117          , _node_entropy  :: e
 118          , _node_children :: Map k (Trie k e)
 119          }
 120  | Leaf { _node_count    :: Int }
 121   deriving (Show)
 122
 123 makeLenses ''Trie
 124
 125 insertTries :: Ord k => [[k]] -> Trie k ()
 126 insertTries = L.foldr insertTrie emptyTrie
 127
 128 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 129 insertTrie []     n                    = n { _node_count = _node_count n +1}
 130 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 131 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 132   where
 133     f = Just . insertTrie xs . fromMaybe emptyTrie
 134
 135 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 136 -- emptyTrie = Node 0 mempty mempty
 137 emptyTrie :: Trie k e
 138 emptyTrie  = Leaf 0
 139
 140 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 141 mkTrie c children
 142   | Map.null children = Leaf c
 143   | otherwise         = Node c mempty children
 144
 145                         -----------------------------
 146
 147 -- | Trie to Tree since Tree as nice print function
 148 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 149 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 150 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 151
 152 ------------------------------------------------------------------------
 153 ------------------------------------------------------------------------
 154
 155 nan :: Floating e => e
 156 nan = 0 / 0
 157
 158 noNaNs :: P.RealFloat e => [e] -> [e]
 159 noNaNs = filter (not . P.isNaN)
 160
 161 updateIfDefined :: P.RealFloat e => e -> e -> e
 162 updateIfDefined e0 e | P.isNaN e = e0
 163                      | otherwise = e
 164
 165 subst :: Entropy e => (e, e) -> e -> e
 166 subst (src, dst) x | sim src x = dst
 167                    | otherwise = x
 168
 169 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
 170 entropyTrie _    (Leaf c)             = Leaf c
 171 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 172   where
 173     e = sum $ map f $ Map.toList children
 174     f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 175                              else - chc * P.logBase 2 chc
 176       where
 177         chc = fromIntegral (_node_count child) / fromIntegral c
 178 ------------------------------------------------------------------------
 179
 180 normalizeLevel :: Entropy e => [e] -> e -> e
 181 normalizeLevel = checkDiff (go . noNaNs)
 182
 183   where
 184     -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
 185     checkDiff = identity
 186 --  go []  = panic "normalizeLevel: impossible"
 187                         -- trace "normalizeLevel"
 188 --    go [_] = identity
 189     go es  = \e -> (e - m) / v
 190 {-
 191                               in if P.isNaN e'
 192                                   then trace ("normalizeLevel " <> show (e,m,v,es))
 193                                       e
 194                                   else e'
 195 -}
 196       where
 197         m  = mean      es
 198         v  = deviation es
 199
 200 {- Unused
 201
 202 nodeChildren :: Trie k e -> Map k (Trie k e)
 203 nodeChildren (Node _ _ cs) = cs
 204 nodeChildren (Leaf _)      = Map.empty
 205
 206 -}
 207
 208
 209 class IsTrie trie where
 210   buildTrie   :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
 211   nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
 212   nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
 213   findTrie    :: Ord k => [k] -> trie k e -> trie k e
 214   normalizeEntropy :: Entropy e
 215                    => Getting e i e -> ModEntropy i o e
 216                    -> trie k i -> trie k o
 217
 218 -- UNUSED
 219 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
 220 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
 221
 222 instance IsTrie Trie where
 223   buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
 224
 225   nodeEntropy inE (Node _ e _) = e ^. inE
 226   nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
 227                                  nan
 228
 229   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 230   nodeChild _ (Leaf _)      = emptyTrie
 231
 232   findTrie ks t = L.foldl (flip nodeChild) t ks
 233
 234   normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
 235     where
 236       go _ []         _                   = panic "normalizeEntropy' empty levels"
 237       go _ _          (Leaf c)            = Leaf c
 238 --      go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
 239       go f (es : ess) (Node c i children)
 240     --  | any (sim (i ^. inE)) es
 241         = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
 242     --  | otherwise
 243     --  = panic "NOT an elem"
 244
 245
 246   {-
 247   This is only normalizing a node with respect to its brothers (unlike all the
 248   nodes of the same level).
 249
 250   normalizeEntropy inE modE = go $ modE identity
 251     where
 252       go _ (Leaf c) = Leaf c
 253       go f (Node c i children)
 254         | Map.null children =
 255             panic "normalizeEntropy: impossible"
 256         | otherwise         =
 257             Node c (f i) $ go (modE $ normalizeLevel es) <$> children
 258           where
 259             es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
 260   -}
 261 ------------------------------------------------------------------------
 262
 263 levels :: Trie k e -> [[Trie k e]]
 264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 265   where
 266     subForest :: Trie k e -> [Trie k e]
 267     subForest (Leaf _)            = []
 268     subForest (Node _ _ children) = Map.elems children
 269
 270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
 272
 273 ------------------------------------------------------------------------
 274
 275 data Tries k e = Tries
 276   { _fwd :: Trie k e
 277   , _bwd :: Trie k e
 278   }
 279
 280 instance IsTrie Tries where
 281   buildTrie to n tts = Tries { _fwd = buildTrie to n tts
 282                              , _bwd = buildTrie to n (map reverse $ tts)
 283                              }
 284
 285   nodeEntropy inE (Tries fwd bwd) =
 286     mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
 287
 288   findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
 289   --                                                              ^^
 290   -- TODO: here this is tempting to reverse but this is not always what we
 291   -- want. See also nodeAutonomy.
 292
 293   nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
 294
 295   normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
 296
 297 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
 298 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
 299
 300 ------------------------------------------------------------------------
 301 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 302 split _   _  [] = []
 303 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
 304 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
 305   where
 306     consRev [] xss = xss
 307     consRev xs xss = reverse xs : xss
 308
 309     go _ pref []                  = [reverse pref]
 310     go _ pref (Terminal Stop:_)   = [reverse pref]
 311     go t pref (Terminal Start:xs) = go t pref xs
 312     go t pref (x:xs) =
 313         -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
 314         if acc
 315           then go xt (x:pref) xs
 316           else consRev pref $ go xt0 [x] xs
 317       where
 318         xt   = nodeChild x t
 319         xt0  = nodeChild x t0
 320         et   = ne 0 t
 321     --  ^ entropy of the current prefix
 322         ext0 = ne 0 xt0
 323     --  ^ entropy of [x]
 324         ext  = ne 0 xt
 325     --  ^ entropy of the current prefix plus x
 326         acc  = ext > et + ext0
 327         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 328
 329     ne d t = if P.isNaN e then d else e
 330       where e = nodeEntropy inE t
 331
 332 {-
 333 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 334 split inE t0 ts =
 335   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 336 -}
 337
 338 ------------------------------------------------------------------------
 339 ------------------------------------------------------------------------
 340
 341 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 342 mainEleve _ _ = []
 343 {-
 344 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
 345   where
 346     inp = toToken <$> input
 347     t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
 348 -}
 349
 350 sim :: Entropy e => e -> e -> Bool
 351 sim x y = x == y || (P.isNaN x && P.isNaN y)
 352
 353 chunkAlongEleve :: Int -> [a] -> [[a]]
 354 chunkAlongEleve n xs = L.take n <$> L.tails xs
 355
 356 toToken' :: Int -> [[Text]] -> [[Token]]
 357 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
 358
 359 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
 360 testEleve debug n output checks = do
 361   let
 362     {-
 363     pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
 364           | ps <- L.nub $ [ c
 365                           | m <- [1..n]
 366                           , cs <- chunkAlong m 1 <$> inp
 367                           , c <- cs
 368                           ]
 369           ]
 370     -}
 371   --res = map (map printToken) . split identity fwd <$> inp
 372   --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
 373     res = map (map printToken) . split info_autonomy nt <$> inp
 374   when debug $ do
 375     P.putStrLn (show input)
 376     -- forM_ pss (P.putStrLn . show)
 377     P.putStrLn ""
 378     P.putStrLn "Levels:"
 379     forM_ (entropyLevels identity (_fwd t)) $ \level ->
 380       P.putStrLn $ "  " <> show level
 381     P.putStrLn ""
 382     P.putStrLn "Forward:"
 383     printTrie (_fwd nt)
 384     P.putStrLn ""
 385     P.putStrLn "Backward:"
 386     printTrie (_bwd nt)
 387     P.putStrLn ""
 388     P.putStrLn "Splitting:"
 389     P.putStrLn $ show res
 390   forM_ checks checker
 391   pure $ expected == res
 392
 393   where
 394     out = T.words <$> output
 395     expected = fmap (T.splitOn "-") <$> out
 396     input = (T.splitOn "-" =<<) <$> out
 397     inp = toToken <$> input
 398     t = buildTrie toToken' n input
 399     -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
 400     -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
 401     nt = normalizeEntropy identity set_autonomy t
 402
 403     check f msg ref my =
 404       if f ref my
 405         then P.putStrLn $ "    PASS " <> msg <> " " <> show ref
 406         else P.putStrLn $ "    FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
 407
 408     checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
 409       let ns  = parseToken <$> T.words ngram
 410           t' = findTrie ns nt
 411           nsb = parseToken <$> (reverse $ T.words ngram)
 412           tb' = findTrie nsb nt
 413
 414       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 415       check (==) "count"       count       (_node_count (_fwd t'))
 416       check sim  "entropy"     entropy     (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
 417
 418       -- (nodeEntropy info_entropy t')
 419       check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
 420       check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
 421       check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
 422
 423     printTrie =
 424       P.putStrLn . Tree.drawTree
 425                  . fmap show
 426                  . toTree (NonTerminal "")
 427
 428 -- | TODO real data is a list of tokenized sentences
 429 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 430 example0 =  ["New-York is New-York and New-York"]
 431 example1 =  ["to-be or not to-be"]
 432 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 433 example3 =  example0 <> example0
 434        -- > TEST: Should not have York New in the trie
 435 example4 =  ["a-b-c-d e a-b-c-d f"]
 436 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 437 example6 =  ["le-petit chat"
 438             ,"le-petit chien"
 439             ,"le-petit rat"
 440             ,"le gros rat"
 441             ]
 442
 443 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
 444
 445 checks0 =
 446   [("<start>", 1, nan, nan, nan, nan, 0.0)
 447   ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
 448   ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
 449   ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 450   ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 451   ,("<stop>", 0, nan, nan, nan, 0.0, nan)
 452
 453 {-
 454   ,("<start> New", 1, nan, nan, nan, nan, 0.0)
 455   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
 456   ,("York is", 1, 0.0, nan, nan, nan, 0.0)
 457   ,("is New", 1, 0.0, nan, nan, nan, 0.0)
 458   ,("York and", 1, 0.0, nan, nan, nan, 0.0)
 459   ,("and New", 1, 0.0, nan, nan, nan, 0.0)
 460   ,("York <stop>", 1, nan, nan, nan, nan, nan)
 461
 462   ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
 463   ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
 464   ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
 465   ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
 466   ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
 467   ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
 468   ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
 469   ,("New York <stop>", 1, nan, nan, nan, nan, nan)
 470 --}
 471   ]
 472
 473
 474
 475 checks2 =
 476   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 477   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 478   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 479   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 480   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 481   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 482   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 483   ]
 484
 485
 486 runTests :: IO ()
 487 runTests =
 488   forM_
 489     [("example0", 2, example0, checks0)
 490     ,("example1", 2, example1, [])
 491     ,("example2", 3, example2, checks2)
 492     ,("example3", 2, example3, [])
 493     ,("example4", 4, example4, [])
 494     ,("example5", 5, example5, [])
 495     ]
 496     (\(name, n, ex, checks) -> do
 497       P.putStrLn $ name <> " " <> show n
 498       b <- testEleve False n ex checks
 499       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 500     )