src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO fix normalization
  24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  26
  27 - AD: Real ngrams extraction test
  28   from Gargantext.Text.Terms import extractTermsUnsupervised
  29   docs <- runCmdRepl $ selectDocs 1004
  30   extractTermsUnsupervised 3 $ DT.intercalate " "
  31                         $ catMaybes
  32                         $ Gargantext.map _hyperdataDocument_abstract docs
  33
  34 -}
  35 {-# LANGUAGE ConstraintKinds   #-}
  36 {-# LANGUAGE NoImplicitPrelude #-}
  37 {-# LANGUAGE OverloadedStrings #-}
  38 {-# LANGUAGE RankNTypes        #-}
  39 {-# LANGUAGE TemplateHaskell   #-}
  40 {-# LANGUAGE TypeFamilies      #-}
  41
  42 module Gargantext.Text.Eleve where
  43
  44 import Debug.Trace (trace)
  45 -- import Debug.SimpleReflect
  46
  47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
  48 import Control.Monad (foldM, mapM_, forM_)
  49 import Data.Ord (Ord)
  50 import qualified Data.List as L
  51 import Data.Monoid
  52 import Data.Text (Text)
  53 import qualified Data.Text as T
  54 import Data.Map (Map)
  55 import Data.Maybe (fromMaybe, catMaybes)
  56 import qualified Data.Map as Map
  57 import Gargantext.Prelude hiding (cs)
  58 import qualified Data.Tree as Tree
  59 import Data.Tree (Tree)
  60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  61
  62 type Entropy e =
  63   ( Fractional e
  64   , Floating e
  65   , P.RealFloat e
  66   , Show e
  67   -- ^ TODO: only used for debugging
  68   )
  69 ------------------------------------------------------------------------
  70 -- | Example and tests for development
  71 data I e = I
  72   { _info_entropy  :: e
  73   , _info_autonomy :: e
  74   }
  75
  76 instance Show e => Show (I e) where
  77   show (I e n) = show (e, n)
  78
  79 makeLenses ''I
  80
  81 type ModEntropy i o e = (e -> e) -> i -> o
  82
  83 set_autonomy :: ModEntropy e (I e) e
  84 set_autonomy f e = I e (f e)
  85
  86 data StartStop = Start | Stop
  87   deriving (Ord, Eq, Show)
  88
  89 data Token = NonTerminal Text
  90            | Terminal StartStop
  91   deriving (Ord, Eq, Show)
  92
  93 isTerminal :: Token -> Bool
  94 isTerminal (Terminal    _) = True
  95 isTerminal (NonTerminal _) = False
  96
  97 parseToken :: Text -> Token
  98 parseToken "<start>" = Terminal Start
  99 parseToken "<stop>"  = Terminal Stop
 100 parseToken t         = NonTerminal t
 101
 102 toToken :: [Text] -> [Token]
 103 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 104
 105 printToken :: Token -> Text
 106 printToken = f
 107   where
 108     f (NonTerminal x)  = x
 109     f (Terminal Start) = "<start>"
 110     f (Terminal Stop)  = "<stop>"
 111
 112 ------------------------------------------------------------------------
 113
 114 data Trie k e
 115   = Node { _node_count    :: Int
 116          , _node_entropy  :: e
 117          , _node_children :: Map k (Trie k e)
 118          }
 119  | Leaf { _node_count    :: Int }
 120   deriving (Show)
 121
 122 makeLenses ''Trie
 123
 124 insertTries :: Ord k => [[k]] -> Trie k ()
 125 insertTries = L.foldr insertTrie emptyTrie
 126
 127 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 128 insertTrie []     n                    = n { _node_count = _node_count n +1}
 129 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 130 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 131   where
 132     f = Just . insertTrie xs . fromMaybe emptyTrie
 133
 134 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 135 -- emptyTrie = Node 0 mempty mempty
 136 emptyTrie :: Trie k e
 137 emptyTrie  = Leaf 0
 138
 139 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 140 mkTrie c children
 141   | Map.null children = Leaf c
 142   | otherwise         = Node c mempty children
 143
 144                         -----------------------------
 145
 146 -- | Trie to Tree since Tree as nice print function
 147 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 148 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 149 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 150
 151 ------------------------------------------------------------------------
 152 ------------------------------------------------------------------------
 153
 154 nan :: Floating e => e
 155 nan = 0 / 0
 156
 157 noNaNs :: P.RealFloat e => [e] -> [e]
 158 noNaNs = filter (not . P.isNaN)
 159
 160 updateIfDefined :: P.RealFloat e => e -> e -> e
 161 updateIfDefined e0 e | P.isNaN e = e0
 162                      | otherwise = e
 163
 164 entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
 165 entropyTrie _    (Leaf c)             = Leaf c
 166 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 167   where
 168     e = sum $ map f $ Map.toList children
 169     f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 170                              else - chc * P.logBase 2 chc
 171       where
 172         chc = fromIntegral (_node_count child) / fromIntegral c
 173 ------------------------------------------------------------------------
 174
 175 normalizeLevel :: Entropy e => [e] -> e -> e
 176 normalizeLevel = checkDiff (go . noNaNs)
 177
 178   where
 179     -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
 180     checkDiff = identity
 181     go []  = panic "normalizeLevel: impossible"
 182                         -- trace "normalizeLevel"
 183 --    go [_] = identity
 184     go es  = \e -> (e - m) / v
 185 {-
 186                               in if P.isNaN e'
 187                                   then trace ("normalizeLevel " <> show (e,m,v,es))
 188                                       e
 189                                   else e'
 190 -}
 191       where
 192         m  = mean      es
 193         v  = deviation es
 194
 195 {- Unused
 196
 197 nodeChildren :: Trie k e -> Map k (Trie k e)
 198 nodeChildren (Node _ _ cs) = cs
 199 nodeChildren (Leaf _)      = Map.empty
 200
 201 -}
 202
 203 class IsTrie trie where
 204   buildTrie :: Floating e => [[Token]] -> trie Token e
 205   nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
 206   nodeChild :: Ord k => k -> trie k e -> trie k e
 207   findTrie :: Ord k => [k] -> trie k e -> trie k e
 208   normalizeEntropy :: Entropy e
 209                    => Getting e i e -> ModEntropy i o e
 210                    -> trie k i -> trie k o
 211
 212   nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
 213   nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
 214
 215 instance IsTrie Trie where
 216   buildTrie = entropyTrie isTerminal . insertTries
 217
 218   nodeEntropy inE (Node _ e _) = e ^. inE
 219   nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
 220                                  nan
 221
 222   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 223   nodeChild _ (Leaf _)      = emptyTrie
 224
 225   findTrie ks t = L.foldl (flip nodeChild) t ks
 226
 227   normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
 228     where
 229       go _ []         _                   = panic "normalizeEntropy' empty levels"
 230       go _ _          (Leaf c)            = Leaf c
 231       go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
 232       go f (es : ess) (Node c i children) =
 233           Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
 234
 235
 236   {-
 237   This is only normalizing a node with respect to its brothers (unlike all the
 238   nodes of the same level).
 239
 240   normalizeEntropy inE modE = go $ modE identity
 241     where
 242       go _ (Leaf c) = Leaf c
 243       go f (Node c i children)
 244         | Map.null children =
 245             panic "normalizeEntropy: impossible"
 246         | otherwise         =
 247             Node c (f i) $ go (modE $ normalizeLevel es) <$> children
 248           where
 249             es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
 250   -}
 251 ------------------------------------------------------------------------
 252
 253 levels :: Trie k e -> [[Trie k e]]
 254 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 255   where
 256     subForest :: Trie k e -> [Trie k e]
 257     subForest (Leaf _)            = []
 258     subForest (Node _ _ children) = Map.elems children
 259
 260 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 261 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
 262
 263 ------------------------------------------------------------------------
 264
 265 data Tries k e = Tries
 266   { _fwd :: Trie k e
 267   , _bwd :: Trie k e
 268   }
 269
 270 instance IsTrie Tries where
 271   buildTrie tts = Tries { _fwd = buildTrie tts
 272                         , _bwd = buildTrie (reverse <$> tts)
 273                         }
 274
 275   nodeEntropy inE (Tries fwd bwd) =
 276     mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
 277
 278   findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)
 279
 280   nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
 281
 282   normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
 283
 284 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
 285 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
 286
 287 ------------------------------------------------------------------------
 288 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 289 split _   _  [] = []
 290 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
 291 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
 292   where
 293     consRev [] xss = xss
 294     consRev xs xss = reverse xs : xss
 295
 296     go _ pref []                  = [reverse pref]
 297     go _ pref (Terminal Stop:_)   = [reverse pref]
 298     go t pref (Terminal Start:xs) = go t pref xs
 299     go t pref (x:xs) =
 300         -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
 301         if acc
 302           then go xt (x:pref) xs
 303           else consRev pref $ go xt0 [x] xs
 304       where
 305         xt   = nodeChild x t
 306         xt0  = nodeChild x t0
 307         et   = ne 0 t
 308     --  ^ entropy of the current prefix
 309         ext0 = ne 0 xt0
 310     --  ^ entropy of [x]
 311         ext  = ne 0 xt
 312     --  ^ entropy of the current prefix plus x
 313         acc  = ext > et + ext0
 314         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 315
 316     ne d t = if P.isNaN e then d else e
 317       where e = nodeEntropy inE t
 318
 319 {-
 320 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 321 split inE t0 ts =
 322   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 323 -}
 324
 325 ------------------------------------------------------------------------
 326 ------------------------------------------------------------------------
 327
 328 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 329 mainEleve _ _ = []
 330 {-
 331 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
 332   where
 333     inp = toToken <$> input
 334     t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
 335 -}
 336
 337 sim :: Entropy e => e -> e -> Bool
 338 sim x y = x == y || (P.isNaN x && P.isNaN y)
 339
 340 chunkAlongEleve :: Int -> [a] -> [[a]]
 341 chunkAlongEleve n xs = L.take n <$> L.tails xs
 342
 343 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
 344 testEleve debug n output checks = do
 345   let
 346     {-
 347     pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
 348           | ps <- L.nub $ [ c
 349                           | m <- [1..n]
 350                           , cs <- chunkAlong m 1 <$> inp
 351                           , c <- cs
 352                           ]
 353           ]
 354     -}
 355   --res = map (map printToken) . split identity fwd <$> inp
 356   --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
 357     res = map (map printToken) . split info_autonomy nt <$> inp
 358   when debug $ do
 359     P.putStrLn (show input)
 360     -- mapM_ (P.putStrLn . show) pss
 361     P.putStrLn ""
 362 --    printTrie nt
 363     printTrie (_fwd nt)
 364     printTrie (_bwd nt)
 365     P.putStrLn $ show res
 366   forM_ checks checker
 367   pure $ expected == res
 368
 369   where
 370     out = T.words <$> output
 371     expected = fmap (T.splitOn "-") <$> out
 372     input = (T.splitOn "-" =<<) <$> out
 373     inp = toToken <$> input
 374     t = buildTrie $ L.concat $ chunkAlongEleve (n + 2) <$> inp
 375     -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
 376     -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
 377     nt = normalizeEntropy identity set_autonomy t
 378
 379     check f msg x y =
 380       if f x y
 381         then P.putStrLn $ "    PASS " <> msg <> " " <> show x <> " ~= " <> show y
 382         else P.putStrLn $ "    FAIL " <> msg <> " " <> show x <> " /= " <> show y
 383
 384     checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
 385       let ns = parseToken <$> T.words ngram
 386           t' = findTrie ns nt
 387       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 388       check (==) "count"       count       (_node_count (_fwd t'))
 389       check sim  "entropy"     entropy     (nodeEntropy info_entropy t')
 390       check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
 391       check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
 392       check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (findTrie ns (_bwd nt)))
 393
 394     printTrie =
 395       P.putStrLn . Tree.drawTree
 396                  . fmap show
 397                  . toTree (NonTerminal "")
 398
 399 -- | TODO real data is a list of tokenized sentences
 400 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 401 example0 =  ["New-York is New-York and New-York"]
 402 example1 =  ["to-be or not to-be"]
 403 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 404 example3 =  example0 <> example0
 405        -- > TEST: Should not have York New in the trie
 406 example4 =  ["a-b-c-d e a-b-c-d f"]
 407 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 408 example6 =  ["le-petit chat"
 409             ,"le-petit chien"
 410             ,"le-petit rat"
 411             ,"le gros rat"
 412             ]
 413
 414 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
 415
 416 checks0 =
 417   [("<start> New", 1, nan, nan, nan, nan, 0.0)
 418   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 419   ,("York is", 1, 0.0, nan, nan, nan, 0.0)
 420   ,("is New", 1, 0.0, nan, nan, nan, 0.0)
 421   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 422   ,("York and", 1, 0.0, nan, nan, nan, 0.0)
 423   ,("and New", 1, 0.0, nan, nan, nan, 0.0)
 424   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 425   ,("York <stop>", 1, nan, nan, nan, nan, nan)
 426   ]
 427
 428 checks2 =
 429   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 430   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 431   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 432   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 433   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 434   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 435   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 436   ]
 437
 438
 439 runTests :: IO ()
 440 runTests =
 441   forM_
 442     [("example0", 2, example0, checks0)
 443     ,("example1", 2, example1, [])
 444     ,("example2", 3, example2, checks2)
 445     ,("example3", 2, example3, [])
 446     ,("example4", 4, example4, [])
 447     ,("example5", 5, example5, [])
 448     ]
 449     (\(name, n, ex, checks) -> do
 450       P.putStrLn $ name <> " " <> show n
 451       b <- testEleve False n ex checks
 452       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 453     )