src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO fix normalization
  24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  26
  27 - AD: Real ngrams extraction test
  28   from Gargantext.Text.Terms import extractTermsUnsupervised
  29   docs <- runCmdRepl $ selectDocs 1004
  30   extractTermsUnsupervised 3 $ DT.intercalate " "
  31                         $ catMaybes
  32                         $ Gargantext.map _hyperdataDocument_abstract docs
  33
  34 -}
  35 {-# LANGUAGE ConstraintKinds   #-}
  36 {-# LANGUAGE NoImplicitPrelude #-}
  37 {-# LANGUAGE OverloadedStrings #-}
  38 {-# LANGUAGE RankNTypes        #-}
  39 {-# LANGUAGE TemplateHaskell   #-}
  40 {-# LANGUAGE TypeFamilies      #-}
  41
  42 module Gargantext.Text.Eleve where
  43
  44 import Debug.Trace (trace)
  45 -- import Debug.SimpleReflect
  46
  47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
  48 import Control.Monad (foldM, mapM_, forM_)
  49 import Data.Ord (Ord)
  50 import qualified Data.List as L
  51 import Data.Monoid
  52 import Data.Text (Text)
  53 import qualified Data.Text as T
  54 import Data.Map (Map)
  55 import Data.Maybe (fromMaybe, catMaybes)
  56 import qualified Data.Map as Map
  57 import Gargantext.Prelude hiding (cs)
  58 import qualified Data.Tree as Tree
  59 import Data.Tree (Tree)
  60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  61
  62 type Entropy e =
  63   ( Fractional e
  64   , Floating e
  65   , P.RealFloat e
  66   , Show e
  67   -- ^ TODO: only used for debugging
  68   )
  69 ------------------------------------------------------------------------
  70 -- | Example and tests for development
  71 data I e = I
  72   { _info_entropy  :: e
  73   , _info_autonomy :: e
  74   }
  75
  76 instance Show e => Show (I e) where
  77   show (I e n) = show (e, n)
  78
  79 makeLenses ''I
  80
  81 type ModEntropy i o e = (e -> e) -> i -> o
  82
  83 set_autonomy :: ModEntropy e (I e) e
  84 set_autonomy f e = I e (f e)
  85
  86 data StartStop = Start | Stop
  87   deriving (Ord, Eq, Show)
  88
  89 data Token = NonTerminal Text
  90            | Terminal StartStop
  91   deriving (Ord, Eq, Show)
  92
  93 isTerminal :: Token -> Bool
  94 isTerminal (Terminal    _) = True
  95 isTerminal (NonTerminal _) = False
  96
  97 parseToken :: Text -> Token
  98 parseToken "<start>" = Terminal Start
  99 parseToken "<stop>"  = Terminal Stop
 100 parseToken t         = NonTerminal t
 101
 102 toToken :: [Text] -> [Token]
 103 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 104
 105 printToken :: Token -> Text
 106 printToken = f
 107   where
 108     f (NonTerminal x)  = x
 109     f (Terminal Start) = "<start>"
 110     f (Terminal Stop)  = "<stop>"
 111
 112 ------------------------------------------------------------------------
 113
 114 data Trie k e
 115   = Node { _node_count    :: Int
 116          , _node_entropy  :: e
 117          , _node_children :: Map k (Trie k e)
 118          }
 119  | Leaf { _node_count    :: Int }
 120   deriving (Show)
 121
 122 makeLenses ''Trie
 123
 124 insertTries :: Ord k => [[k]] -> Trie k ()
 125 insertTries = L.foldr insertTrie emptyTrie
 126
 127 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 128 insertTrie []     n                    = n { _node_count = _node_count n +1}
 129 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 130 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 131   where
 132     f = Just . insertTrie xs . fromMaybe emptyTrie
 133
 134 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 135 -- emptyTrie = Node 0 mempty mempty
 136 emptyTrie :: Trie k e
 137 emptyTrie  = Leaf 0
 138
 139 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 140 mkTrie c children
 141   | Map.null children = Leaf c
 142   | otherwise         = Node c mempty children
 143
 144                         -----------------------------
 145
 146 -- | Trie to Tree since Tree as nice print function
 147 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 148 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 149 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 150
 151 ------------------------------------------------------------------------
 152 ------------------------------------------------------------------------
 153
 154 nan :: Floating e => e
 155 nan = 0 / 0
 156
 157 noNaNs :: P.RealFloat e => [e] -> [e]
 158 noNaNs = filter (not . P.isNaN)
 159
 160 updateIfDefined :: P.RealFloat e => e -> e -> e
 161 updateIfDefined e0 e | P.isNaN e = e0
 162                      | otherwise = e
 163
 164 entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
 165 entropyTrie _    (Leaf c)             = Leaf c
 166 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 167   where
 168     e = sum $ map f $ Map.toList children
 169     f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 170                              else - chc * P.logBase 2 chc
 171       where
 172         chc = fromIntegral (_node_count child) / fromIntegral c
 173 ------------------------------------------------------------------------
 174
 175 normalizeLevel :: Entropy e => [e] -> e -> e
 176 normalizeLevel = checkDiff (go . noNaNs)
 177
 178   where
 179     -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
 180     checkDiff = identity
 181     go []  = panic "normalizeLevel: impossible"
 182                         -- trace "normalizeLevel"
 183 --    go [_] = identity
 184     go es  = \e -> (e - m) / v
 185 {-
 186                               in if P.isNaN e'
 187                                   then trace ("normalizeLevel " <> show (e,m,v,es))
 188                                       e
 189                                   else e'
 190 -}
 191       where
 192         m  = mean      es
 193         v  = deviation es
 194
 195 {- Unused
 196
 197 nodeChildren :: Trie k e -> Map k (Trie k e)
 198 nodeChildren (Node _ _ cs) = cs
 199 nodeChildren (Leaf _)      = Map.empty
 200
 201 -}
 202
 203 class IsTrie trie where
 204   buildTrie :: Floating e => [[Token]] -> trie Token e
 205   nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
 206   nodeChild :: Ord k => k -> trie k e -> trie k e
 207   findTrie :: Ord k => [k] -> trie k e -> trie k e
 208   normalizeEntropy :: Entropy e
 209                    => Getting e i e -> ModEntropy i o e
 210                    -> trie k i -> trie k o
 211
 212 -- UNUSED
 213 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
 214 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
 215
 216 instance IsTrie Trie where
 217   buildTrie = entropyTrie isTerminal . insertTries
 218
 219   nodeEntropy inE (Node _ e _) = e ^. inE
 220   nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
 221                                  nan
 222
 223   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 224   nodeChild _ (Leaf _)      = emptyTrie
 225
 226   findTrie ks t = L.foldl (flip nodeChild) t ks
 227
 228   normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
 229     where
 230       go _ []         _                   = panic "normalizeEntropy' empty levels"
 231       go _ _          (Leaf c)            = Leaf c
 232       go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
 233       go f (es : ess) (Node c i children) =
 234           Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
 235
 236
 237   {-
 238   This is only normalizing a node with respect to its brothers (unlike all the
 239   nodes of the same level).
 240
 241   normalizeEntropy inE modE = go $ modE identity
 242     where
 243       go _ (Leaf c) = Leaf c
 244       go f (Node c i children)
 245         | Map.null children =
 246             panic "normalizeEntropy: impossible"
 247         | otherwise         =
 248             Node c (f i) $ go (modE $ normalizeLevel es) <$> children
 249           where
 250             es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
 251   -}
 252 ------------------------------------------------------------------------
 253
 254 levels :: Trie k e -> [[Trie k e]]
 255 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 256   where
 257     subForest :: Trie k e -> [Trie k e]
 258     subForest (Leaf _)            = []
 259     subForest (Node _ _ children) = Map.elems children
 260
 261 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 262 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
 263
 264 ------------------------------------------------------------------------
 265
 266 data Tries k e = Tries
 267   { _fwd :: Trie k e
 268   , _bwd :: Trie k e
 269   }
 270
 271 instance IsTrie Tries where
 272   buildTrie tts = Tries { _fwd = buildTrie tts
 273                         , _bwd = buildTrie (reverse <$> tts)
 274                         }
 275
 276   nodeEntropy inE (Tries fwd bwd) =
 277     mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
 278
 279   findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
 280   --                                                              ^^
 281   -- TODO: here this is tempting to reverse but this is not always what we
 282   -- want. See also nodeAutonomy.
 283
 284   nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
 285
 286   normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
 287
 288 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
 289 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
 290
 291 ------------------------------------------------------------------------
 292 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 293 split _   _  [] = []
 294 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
 295 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
 296   where
 297     consRev [] xss = xss
 298     consRev xs xss = reverse xs : xss
 299
 300     go _ pref []                  = [reverse pref]
 301     go _ pref (Terminal Stop:_)   = [reverse pref]
 302     go t pref (Terminal Start:xs) = go t pref xs
 303     go t pref (x:xs) =
 304         -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
 305         if acc
 306           then go xt (x:pref) xs
 307           else consRev pref $ go xt0 [x] xs
 308       where
 309         xt   = nodeChild x t
 310         xt0  = nodeChild x t0
 311         et   = ne 0 t
 312     --  ^ entropy of the current prefix
 313         ext0 = ne 0 xt0
 314     --  ^ entropy of [x]
 315         ext  = ne 0 xt
 316     --  ^ entropy of the current prefix plus x
 317         acc  = ext > et + ext0
 318         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 319
 320     ne d t = if P.isNaN e then d else e
 321       where e = nodeEntropy inE t
 322
 323 {-
 324 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 325 split inE t0 ts =
 326   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 327 -}
 328
 329 ------------------------------------------------------------------------
 330 ------------------------------------------------------------------------
 331
 332 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 333 mainEleve _ _ = []
 334 {-
 335 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
 336   where
 337     inp = toToken <$> input
 338     t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
 339 -}
 340
 341 sim :: Entropy e => e -> e -> Bool
 342 sim x y = x == y || (P.isNaN x && P.isNaN y)
 343
 344 chunkAlongEleve :: Int -> [a] -> [[a]]
 345 chunkAlongEleve n xs = L.take n <$> L.tails xs
 346
 347 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
 348 testEleve debug n output checks = do
 349   let
 350     {-
 351     pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
 352           | ps <- L.nub $ [ c
 353                           | m <- [1..n]
 354                           , cs <- chunkAlong m 1 <$> inp
 355                           , c <- cs
 356                           ]
 357           ]
 358     -}
 359   --res = map (map printToken) . split identity fwd <$> inp
 360   --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
 361     res = map (map printToken) . split info_autonomy nt <$> inp
 362   when debug $ do
 363     P.putStrLn (show input)
 364     -- mapM_ (P.putStrLn . show) pss
 365     P.putStrLn ""
 366 --    printTrie nt
 367     printTrie (_fwd nt)
 368     printTrie (_bwd nt)
 369     P.putStrLn $ show res
 370   forM_ checks checker
 371   pure $ expected == res
 372
 373   where
 374     out = T.words <$> output
 375     expected = fmap (T.splitOn "-") <$> out
 376     input = (T.splitOn "-" =<<) <$> out
 377     inp = toToken <$> input
 378     t = buildTrie $ L.concat $ chunkAlongEleve (n + 2) <$> inp
 379     -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
 380     -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
 381     nt = normalizeEntropy identity set_autonomy t
 382
 383     check f msg ref my =
 384       if f ref my
 385         then P.putStrLn $ "    PASS " <> msg <> " " <> show ref
 386         else P.putStrLn $ "    FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
 387
 388     checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
 389       let ns = parseToken <$> T.words ngram
 390           t' = findTrie ns nt
 391       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 392       check (==) "count"       count       (_node_count (_fwd t'))
 393       check sim  "entropy"     entropy     (nodeEntropy info_entropy t')
 394       check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
 395       check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
 396       check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
 397
 398     printTrie =
 399       P.putStrLn . Tree.drawTree
 400                  . fmap show
 401                  . toTree (NonTerminal "")
 402
 403 -- | TODO real data is a list of tokenized sentences
 404 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 405 example0 =  ["New-York is New-York and New-York"]
 406 example1 =  ["to-be or not to-be"]
 407 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 408 example3 =  example0 <> example0
 409        -- > TEST: Should not have York New in the trie
 410 example4 =  ["a-b-c-d e a-b-c-d f"]
 411 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 412 example6 =  ["le-petit chat"
 413             ,"le-petit chien"
 414             ,"le-petit rat"
 415             ,"le gros rat"
 416             ]
 417
 418 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
 419
 420 checks0 = {-
 421   [("<start> New", 1, nan, nan, nan, nan, 0.0)
 422   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 423   ,("York is", 1, 0.0, nan, nan, nan, 0.0)
 424   ,("is New", 1, 0.0, nan, nan, nan, 0.0)
 425   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 426   ,("York and", 1, 0.0, nan, nan, nan, 0.0)
 427   ,("and New", 1, 0.0, nan, nan, nan, 0.0)
 428   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 429   ,("York <stop>", 1, nan, nan, nan, nan, nan)
 430   ]-}
 431
 432   [("<start>", 1, nan, nan, nan, nan, 0.0)
 433   , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
 434   , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
 435   , ("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 436   , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
 437   , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
 438   , ("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
 439   , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
 440   , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
 441   , ("<stop>", 0, nan, nan, nan, 0.0, nan)]
 442   <>
 443
 444   [("<start> New", 1, nan, nan, nan, nan, 0.0)
 445   , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
 446   , ("York is", 1, 0.0, nan, nan, nan, 0.0)
 447   , ("is New", 1, 0.0, nan, nan, nan, 0.0)
 448   , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
 449   , ("York and", 1, 0.0, nan, nan, nan, 0.0)
 450   , ("and New", 1, 0.0, nan, nan, nan, 0.0)
 451   , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
 452   , ("York <stop>", 1, nan, nan, nan, nan, nan)]
 453   <>
 454   [("<start> New York", 1, nan, nan, nan, nan, 0.0)
 455   , ("New York is", 1, 0.0, nan, nan, nan, 0.0)
 456   , ("York is New", 1, 0.0, nan, nan, nan, 0.0)
 457   , ("is New York", 1, 0.0, nan, nan, nan, 0.0)
 458   , ("New York and", 1, 0.0, nan, nan, nan, 0.0)
 459   , ("York and New", 1, 0.0, nan, nan, nan, 0.0)
 460   , ("and New York", 1, 0.0, nan, nan, nan, 0.0)
 461   , ("New York <stop>", 1, nan, nan, nan, nan, nan)
 462   , ("York <stop>", 1, nan, nan, nan, nan, nan)
 463   , ("<stop>", 0, nan, nan, nan, 0.0, nan)
 464   , ("", 9, 2.113283334294875, nan, nan, 2.113283334294875, 2.113283334294875)]
 465
 466
 467
 468 checks2 =
 469   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 470   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 471   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 472   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 473   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 474   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 475   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 476   ]
 477
 478
 479 runTests :: IO ()
 480 runTests =
 481   forM_
 482     [("example0", 2, example0, checks0)
 483     ,("example1", 2, example1, [])
 484     ,("example2", 3, example2, checks2)
 485     ,("example3", 2, example3, [])
 486     ,("example4", 4, example4, [])
 487     ,("example5", 5, example5, [])
 488     ]
 489     (\(name, n, ex, checks) -> do
 490       P.putStrLn $ name <> " " <> show n
 491       b <- testEleve False n ex checks
 492       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 493     )