src/Gargantext/Text/Eleve.hs

   1 {-|
   2 Module      : Gargantext.Text.Eleve
   3 Description : Unsupervized Word segmentation
   4 Copyright   : (c) CNRS, 2019-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 # Implementation of Unsupervized Word Segmentation
  11
  12 References:
  13
  14 - Python implementation (Korantin August, Emmanuel Navarro):
  15   [EleVe](https://github.com/kodexlab/eleve.git)
  16
  17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  18   Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  19   the 50th Annual Meeting of the Association for Computational Linguistics
  20   , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
  21
  22 Notes for current implementation:
  23 - TODO fix normalization
  24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
  25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
  26
  27 - AD: Real ngrams extraction test
  28   from Gargantext.Text.Terms import extractTermsUnsupervised
  29   docs <- runCmdRepl $ selectDocs 1004
  30   extractTermsUnsupervised 3 $ DT.intercalate " "
  31                         $ catMaybes
  32                         $ Gargantext.map _hyperdataDocument_abstract docs
  33
  34 -}
  35 {-# LANGUAGE ConstraintKinds   #-}
  36 {-# LANGUAGE NoImplicitPrelude #-}
  37 {-# LANGUAGE OverloadedStrings #-}
  38 {-# LANGUAGE RankNTypes        #-}
  39 {-# LANGUAGE TemplateHaskell   #-}
  40 {-# LANGUAGE TypeFamilies      #-}
  41
  42 module Gargantext.Text.Eleve where
  43
  44 import Debug.Trace (trace)
  45 -- import Debug.SimpleReflect
  46
  47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
  48 import Control.Monad (foldM, mapM_, forM_)
  49 import Data.Ord (Ord)
  50 import qualified Data.List as L
  51 import Data.Monoid
  52 import Data.Text (Text)
  53 import qualified Data.Text as T
  54 import Data.Map (Map)
  55 import Data.Maybe (fromMaybe, catMaybes)
  56 import qualified Data.Map as Map
  57 import Gargantext.Prelude hiding (cs)
  58 import qualified Data.Tree as Tree
  59 import Data.Tree (Tree)
  60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
  61
  62 type Entropy e =
  63   ( Fractional e
  64   , Floating e
  65   , P.RealFloat e
  66   , Show e
  67   -- ^ TODO: only used for debugging
  68   )
  69 ------------------------------------------------------------------------
  70 -- | Example and tests for development
  71 data I e = I
  72   { _info_entropy  :: e
  73   , _info_autonomy :: e
  74   }
  75
  76 instance Show e => Show (I e) where
  77   show (I e n) = show (e, n)
  78
  79 makeLenses ''I
  80
  81 type ModEntropy i o e = (e -> e) -> i -> o
  82
  83 set_autonomy :: ModEntropy e (I e) e
  84 set_autonomy f e = I e (f e)
  85
  86 data StartStop = Start | Stop
  87   deriving (Ord, Eq, Show)
  88
  89 data Token = NonTerminal Text
  90            | Terminal StartStop
  91   deriving (Ord, Eq, Show)
  92
  93 isTerminal :: Token -> Bool
  94 isTerminal (Terminal    _) = True
  95 isTerminal (NonTerminal _) = False
  96
  97 parseToken :: Text -> Token
  98 parseToken "<start>" = Terminal Start
  99 parseToken "<stop>"  = Terminal Stop
 100 parseToken t         = NonTerminal t
 101
 102 toToken :: [Text] -> [Token]
 103 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
 104
 105 printToken :: Token -> Text
 106 printToken = f
 107   where
 108     f (NonTerminal x)  = x
 109     f (Terminal Start) = "<start>"
 110     f (Terminal Stop)  = "<stop>"
 111
 112 ------------------------------------------------------------------------
 113
 114 data Trie k e
 115   = Node { _node_count    :: Int
 116          , _node_entropy  :: e
 117          , _node_children :: Map k (Trie k e)
 118          }
 119  | Leaf { _node_count    :: Int }
 120   deriving (Show)
 121
 122 makeLenses ''Trie
 123
 124 insertTries :: Ord k => [[k]] -> Trie k ()
 125 insertTries = L.foldr insertTrie emptyTrie
 126
 127 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 128 insertTrie []     n                    = n { _node_count = _node_count n +1}
 129 insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
 130 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
 131   where
 132     f = Just . insertTrie xs . fromMaybe emptyTrie
 133
 134 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 135 -- emptyTrie = Node 0 mempty mempty
 136 emptyTrie :: Trie k e
 137 emptyTrie  = Leaf 0
 138
 139 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 140 mkTrie c children
 141   | Map.null children = Leaf c
 142   | otherwise         = Node c mempty children
 143
 144                         -----------------------------
 145
 146 -- | Trie to Tree since Tree as nice print function
 147 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
 148 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
 149 toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
 150
 151 ------------------------------------------------------------------------
 152 ------------------------------------------------------------------------
 153
 154 nan :: Floating e => e
 155 nan = 0 / 0
 156
 157 updateIfDefined :: P.RealFloat e => e -> e -> e
 158 updateIfDefined e0 e | P.isNaN e = e0
 159                      | otherwise = e
 160
 161 entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
 162 entropyTrie _    (Leaf c)             = Leaf c
 163 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
 164   where
 165     e = sum $ map f $ Map.toList children
 166     f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
 167                              else - chc * P.logBase 2 chc
 168       where
 169         chc = fromIntegral (_node_count child) / fromIntegral c
 170 ------------------------------------------------------------------------
 171
 172 normalizeLevel :: Entropy e => [e] -> e -> e
 173 normalizeLevel = checkDiff (go . filter (not . P.isNaN))
 174
 175   where
 176     -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
 177     checkDiff = identity
 178     go []  = panic "normalizeLevel: impossible"
 179                         -- trace "normalizeLevel"
 180 --    go [_] = identity
 181     go es  = \e -> (e - m) / v
 182 {-
 183                               in if P.isNaN e'
 184                                   then trace ("normalizeLevel " <> show (e,m,v,es))
 185                                       e
 186                                   else e'
 187 -}
 188       where
 189         m  = mean      es
 190         v  = deviation es
 191
 192 {- Unused
 193
 194 nodeChildren :: Trie k e -> Map k (Trie k e)
 195 nodeChildren (Node _ _ cs) = cs
 196 nodeChildren (Leaf _)      = Map.empty
 197
 198 -}
 199
 200 class IsTrie trie where
 201   buildTrie :: Floating e => [[Token]] -> trie Token e
 202   nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
 203   nodeChild :: Ord k => k -> trie k e -> trie k e
 204   findTrie :: Ord k => [k] -> trie k e -> trie k e
 205   normalizeEntropy :: Entropy e
 206                    => Getting e i e -> ModEntropy i o e
 207                    -> trie k i -> trie k o
 208
 209   nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
 210   nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
 211
 212 instance IsTrie Trie where
 213   buildTrie = entropyTrie isTerminal . insertTries
 214
 215   nodeEntropy inE (Node _ e _) = e ^. inE
 216   nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
 217                                  nan
 218
 219   nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
 220   nodeChild _ (Leaf _)      = emptyTrie
 221
 222   findTrie ks t = L.foldl (flip nodeChild) t ks
 223
 224   normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
 225     where
 226       go _ []         _                   = panic "normalizeEntropy' empty levels"
 227       go _ _          (Leaf c)            = Leaf c
 228       go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
 229       go f (es : ess) (Node c i children) =
 230           Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
 231
 232
 233   {-
 234   This is only normalizing a node with respect to its brothers (unlike all the
 235   nodes of the same level).
 236
 237   normalizeEntropy inE modE = go $ modE identity
 238     where
 239       go _ (Leaf c) = Leaf c
 240       go f (Node c i children)
 241         | Map.null children =
 242             panic "normalizeEntropy: impossible"
 243         | otherwise         =
 244             Node c (f i) $ go (modE $ normalizeLevel es) <$> children
 245           where
 246             es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
 247   -}
 248 ------------------------------------------------------------------------
 249
 250 levels :: Trie k e -> [[Trie k e]]
 251 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 252   where
 253     subForest :: Trie k e -> [Trie k e]
 254     subForest (Leaf _)            = []
 255     subForest (Node _ _ children) = Map.elems children
 256
 257 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
 258 entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
 259
 260 ------------------------------------------------------------------------
 261
 262 data Tries k e = Tries
 263   { _fwd :: Trie k e
 264   , _bwd :: Trie k e
 265   }
 266
 267 instance IsTrie Tries where
 268   buildTrie tts = Tries { _fwd = buildTrie tts
 269                         , _bwd = buildTrie (reverse <$> tts)
 270                         }
 271
 272   nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]
 273
 274   findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)
 275
 276   nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
 277
 278   normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
 279
 280 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
 281 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
 282
 283 ------------------------------------------------------------------------
 284 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
 285 split _   _  [] = []
 286 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
 287 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
 288   where
 289     consRev [] xss = xss
 290     consRev xs xss = reverse xs : xss
 291
 292     go _ pref []                  = [reverse pref]
 293     go _ pref (Terminal Stop:_)   = [reverse pref]
 294     go t pref (Terminal Start:xs) = go t pref xs
 295     go t pref (x:xs) =
 296         -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
 297         if acc
 298           then go xt (x:pref) xs
 299           else consRev pref $ go xt0 [x] xs
 300       where
 301         xt   = nodeChild x t
 302         xt0  = nodeChild x t0
 303         et   = ne 0 t
 304     --  ^ entropy of the current prefix
 305         ext0 = ne 0 xt0
 306     --  ^ entropy of [x]
 307         ext  = ne 0 xt
 308     --  ^ entropy of the current prefix plus x
 309         acc  = ext > et + ext0
 310         -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
 311
 312     ne d t = if P.isNaN e then d else e
 313       where e = nodeEntropy inE t
 314
 315 {-
 316 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
 317 split inE t0 ts =
 318   maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
 319 -}
 320
 321 ------------------------------------------------------------------------
 322 ------------------------------------------------------------------------
 323
 324 mainEleve :: Int -> [[Text]] -> [[[Text]]]
 325 mainEleve _ _ = []
 326 {-
 327 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
 328   where
 329     inp = toToken <$> input
 330     t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
 331 -}
 332
 333 sim :: Entropy e => e -> e -> Bool
 334 sim x y = x == y || (P.isNaN x && P.isNaN y)
 335
 336 chunkAlongEleve :: Int -> [a] -> [[a]]
 337 chunkAlongEleve n xs = L.take n <$> L.tails xs
 338
 339 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
 340 testEleve debug n output checks = do
 341   let
 342     {-
 343     pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
 344           | ps <- L.nub $ [ c
 345                           | m <- [1..n]
 346                           , cs <- chunkAlong m 1 <$> inp
 347                           , c <- cs
 348                           ]
 349           ]
 350     -}
 351   --res = map (map printToken) . split identity fwd <$> inp
 352   --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
 353     res = map (map printToken) . split info_autonomy nt <$> inp
 354   when debug $ do
 355     P.putStrLn (show input)
 356     -- mapM_ (P.putStrLn . show) pss
 357     P.putStrLn ""
 358 --    printTrie nt
 359     printTrie (_fwd nt)
 360     printTrie (_bwd nt)
 361     P.putStrLn $ show res
 362   forM_ checks checker
 363   pure $ expected == res
 364
 365   where
 366     out = T.words <$> output
 367     expected = fmap (T.splitOn "-") <$> out
 368     input = (T.splitOn "-" =<<) <$> out
 369     inp = toToken <$> input
 370     t = buildTrie $ L.concat $ chunkAlongEleve (n + 2) <$> inp
 371     -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
 372     -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
 373     nt = normalizeEntropy identity set_autonomy t
 374
 375     check f msg x y =
 376       if f x y
 377         then P.putStrLn $ "    PASS " <> msg <> " " <> show x <> " ~= " <> show y
 378         else P.putStrLn $ "    FAIL " <> msg <> " " <> show x <> " /= " <> show y
 379
 380     checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
 381       let ns = parseToken <$> T.words ngram
 382           t' = findTrie ns nt
 383       P.putStrLn $ "  " <> T.unpack ngram <> ":"
 384       check (==) "count"       count       (_node_count (_fwd t'))
 385       check sim  "entropy"     entropy     (nodeEntropy info_entropy t')
 386       check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
 387       check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
 388       check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (findTrie ns (_bwd nt)))
 389
 390     printTrie =
 391       P.putStrLn . Tree.drawTree
 392                  . fmap show
 393                  . toTree (NonTerminal "")
 394
 395 -- | TODO real data is a list of tokenized sentences
 396 example0, example1, example2, example3, example4, example5, example6 :: [Text]
 397 example0 =  ["New-York is New-York and New-York"]
 398 example1 =  ["to-be or not to-be"]
 399 example2 =  ["to-be-or not to-be-or NOT to-be and"]
 400 example3 =  example0 <> example0
 401        -- > TEST: Should not have York New in the trie
 402 example4 =  ["a-b-c-d e a-b-c-d f"]
 403 example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
 404 example6 =  ["le-petit chat"
 405             ,"le-petit chien"
 406             ,"le-petit rat"
 407             ,"le gros rat"
 408             ]
 409
 410 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
 411
 412 checks0 =
 413   [("<start> New", 1, nan, nan, nan, nan, 0.0)
 414   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 415   ,("York is", 1, 0.0, nan, nan, nan, 0.0)
 416   ,("is New", 1, 0.0, nan, nan, nan, 0.0)
 417   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 418   ,("York and", 1, 0.0, nan, nan, nan, 0.0)
 419   ,("and New", 1, 0.0, nan, nan, nan, 0.0)
 420   ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
 421   ,("York <stop>", 1, nan, nan, nan, nan, nan)
 422   ]
 423
 424 checks2 =
 425   [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
 426   ,("be or",  2, 0.5, nan, nan, nan, 1.0)
 427   ,("or not", 1, 0.0, nan, nan, nan, 0.0)
 428   ,("not to", 1, 0.0, nan, nan, nan, 0.0)
 429   ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
 430   ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
 431   ,("be and", 1, 0.0, nan, nan, nan, 0.0)
 432   ]
 433
 434
 435 runTests :: IO ()
 436 runTests =
 437   forM_
 438     [("example0", 2, example0, checks0)
 439     ,("example1", 2, example1, [])
 440     ,("example2", 3, example2, checks2)
 441     ,("example3", 2, example3, [])
 442     ,("example4", 4, example4, [])
 443     ,("example5", 5, example5, [])
 444     ]
 445     (\(name, n, ex, checks) -> do
 446       P.putStrLn $ name <> " " <> show n
 447       b <- testEleve False n ex checks
 448       P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
 449     )