{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervized Word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

# Implementation of Unsupervized Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese, Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for the current implementation:
- TODO fix normalization
- TODO extract longer ngrams (see the paper above; the Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  from Gargantext.Text.Terms import extractTermsUnsupervised
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}

module Gargantext.Text.Eleve where
import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import qualified Data.List as L
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

type Entropy e = (Fractional e, Floating e, P.RealFloat e, Show e)
-- ^ TODO: Show is only used for debugging
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I { _info_entropy :: e, _info_autonomy :: e }

instance Show e => Show (I e) where
  show (I e n) = show (e, n)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e)
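
-- Illustrative sketch (the binding below is not part of the original module):
-- 'setNormEntropy' keeps the raw entropy and stores a derived value computed
-- by the supplied modifier.
exampleInfo :: I Double
exampleInfo = setNormEntropy negate 1.5
-- i.e. I { _info_entropy = 1.5, _info_autonomy = -1.5 }
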
data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False
toToken :: Int -> [Text] -> [Token]
toToken n xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]

unToken :: [Token] -> [Text]
unToken = map f
  where
    f (NonTerminal x) = x
    f (Terminal   _)  = ""
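
-- Illustrative sketch (not part of the original module): a tokenized sentence
-- is wrapped in Start/Stop markers; note that the Int argument of 'toToken'
-- is currently unused.
exampleTokens :: [Token]
exampleTokens = toToken 2 ["New", "York"]
-- == [Terminal Start, NonTerminal "New", NonTerminal "York", Terminal Stop]
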
------------------------------------------------------------------------
data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e) }
  | Leaf { _node_count    :: Int }
  deriving (Show)
makeLenses ''Trie
insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie
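
-- Illustrative sketch (not part of the original module): building a counting
-- trie from two token sequences; the count at the root equals the number of
-- inserted sequences.
exampleCountTrie :: Trie Token ()
exampleCountTrie = insertTries (toToken 1 . T.words <$> ["New York", "New Jersey"])
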
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children
-----------------------------

-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
------------------------------------------------------------------------
------------------------------------------------------------------------

nan :: Floating e => e
nan = 0 / 0

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e
entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
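
-- Illustrative sketch (not part of the original module): the (forward)
-- branching entropy of every prefix of a tiny corpus, as a 'Trie' of Doubles.
exampleEntropyTrie :: Trie Token Double
exampleEntropyTrie =
  entropyTrie isTerminal (insertTries (toToken 1 . T.words <$> ["New York", "New Jersey"]))
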
------------------------------------------------------------------------

normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . filter (not . P.isNaN))
  where
    -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
    checkDiff = identity
    go [] = panic "normalizeLevel: impossible"
    -- trace "normalizeLevel"
    go es = \e -> (e - m) / v
    -- debugging: trace ("normalizeLevel " <> show (e,m,v,es))
      where
        m = mean      es
        v = deviation es
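
-- Illustrative sketch (not part of the original module), assuming 'mean' and
-- 'deviation' from Gargantext.Prelude are the arithmetic mean and the
-- standard deviation: a value equal to the level mean normalizes to 0.
exampleNormalized :: Double
exampleNormalized = normalizeLevel [1, 2, 3] 2   -- == 0.0
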
nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty
class IsTrie trie where
  buildTrie   :: Floating e => [[Token]] -> trie Token e
  nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
  nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
  findTrie    :: Ord k => [k] -> trie k e -> trie k e
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

  nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
  nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
instance IsTrie Trie where
  buildTrie = entropyTrie isTerminal . insertTries

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
                                 nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
    where
      go _ []         _                   = panic "normalizeEntropy' empty levels"
      go _ _          (Leaf c)            = Leaf c
      go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
      go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
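
-- Illustrative sketch (not part of the original module): looking up the raw
-- branching entropy stored under a prefix of an (un-normalized) trie.
exampleNodeAutonomy :: Double
exampleNodeAutonomy =
  nodeAutonomy identity
               (buildTrie (toToken 1 . T.words <$> ["New York", "New Jersey"]) :: Trie Token Double)
               [NonTerminal "New"]
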
  {-
  This is only normalizing a node with respect to its siblings (unlike all the
  nodes of the same level).

  normalizeEntropy inE modE = go $ modE identity
    where
      go _ (Leaf c) = Leaf c
      go f (Node c i children)
        | Map.null children =
            panic "normalizeEntropy: impossible"
        | otherwise =
            Node c (f i) $ go (modE $ normalizeLevel es) <$> children
        where
          es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
  -}
------------------------------------------------------------------------

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure

subForest :: Trie k e -> [Trie k e]
subForest (Leaf _)            = []
subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
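
-- Illustrative sketch (not part of the original module): per-level entropies
-- of a tiny trie, outermost level (the root) first, NaN leaves filtered out.
exampleEntropyLevels :: [[Double]]
exampleEntropyLevels =
  entropyLevels identity (buildTrie (toToken 1 . T.words <$> ["New York", "New Jersey"]))
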
------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

instance IsTrie Tries where
  buildTrie tts = Tries { _fwd = buildTrie tts
                        , _bwd = buildTrie (reverse <$> tts)
                        }

  nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
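
-- Illustrative sketch (not part of the original module): the 'Tries' instance
-- keeps a forward trie and a trie over the reversed sequences, so entropies
-- can be averaged over both reading directions.
exampleTries :: Tries Token Double
exampleTries = buildTrie (toToken 1 . T.words <$> ["New York", "New Jersey"])
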
------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  []                   = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0)             = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
        -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
        if acc
          then go xt (x:pref) xs
          else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
        -- ^ entropy of the current prefix
        ext0 = ne 0 xt0
        -- ^ entropy of [x]
        ext  = ne 0 xt
        -- ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

    ne d t = if P.isNaN e then d else e
      where e = nodeEntropy inE t
{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}
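
-- Illustrative sketch (not part of the original module), mirroring what
-- 'testEleve' does below: build forward/backward tries over 3-token windows,
-- normalize entropies into autonomy scores, and segment one sentence.
exampleSplit :: [[Text]]
exampleSplit = map unToken $ split info_autonomy nt (toToken 1 ws)
  where
    ws = T.words "New York is New York and New York"
    nt = normalizeEntropy identity setNormEntropy
           (buildTrie (chunkAlong 3 1 (toToken 1 ws)) :: Tries Token Double)
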
------------------------------------------------------------------------
------------------------------------------------------------------------
mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken (n - 1) <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
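
-- Hypothetical usage sketch (not part of the original module): segmenting a
-- one-sentence corpus with n = 3.
exampleMainEleve :: [[[Text]]]
exampleMainEleve = mainEleve 3 [T.words "New York is New York and New York"]
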
sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)
testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
testEleve debug n output checks = do
  let
    {-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m  <- [1..n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c  <- cs
                          ]
          ]
    -}
    --res = map unToken . split identity fwd <$> inp
    --res = map unToken . split info_norm_entropy' nt' <$> inp
    res = map unToken . split info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- mapM_ (P.putStrLn . show) pss
    P.putStrLn $ show res
  forM_ checks checker
  pure $ expected == res
  where
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken (n - 1) <$> input
    t        = buildTrie $ L.concat $ chunkAlong (n + 1) 1 <$> inp
    -- nt = normalizeEntropy identity setNormEntropy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    nt       = normalizeEntropy identity setNormEntropy t

    check f msg x y =
      if f x y
        then P.putStrLn $ "  PASS " <> msg <> " " <> show x <> " ~= " <> show y
        else P.putStrLn $ "  FAIL " <> msg <> " " <> show x <> " /= " <> show y

    checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
      let ns = NonTerminal <$> T.words ngram
          t' = findTrie ns nt
      P.putStrLn $ "  " <> T.unpack ngram <> ":"
      check (==) "count"       count       (_node_count (_fwd t'))
      check sim  "entropy"     entropy     (nodeEntropy info_entropy  t')
      check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
      check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
      check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (findTrie ns (_bwd nt)))

-- | Debug helper: pretty-print a trie as a tree.
printTrie :: Show e => Trie Token e -> IO ()
printTrie =
  P.putStrLn . Tree.drawTree
             . fmap show
             . toTree (NonTerminal "")
-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- > TEST: should not have "York New" in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]
checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]

checks0 =
  [("\ue02b New",  1, nan, nan, nan, nan, 0.0)
  ,("New York",    3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York is",     1, 0.0, nan, nan, nan, 0.0)
  ,("is New",      1, 0.0, nan, nan, nan, 0.0)
  ,("New York",    3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York and",    1, 0.0, nan, nan, nan, 0.0)
  ,("and New",     1, 0.0, nan, nan, nan, 0.0)
  ,("New York",    3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York \ue02d", 1, nan, nan, nan, nan, nan)
  ]
checks2 =
  [("to be",  3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
  ,("be or",  2, 0.5, nan, nan, nan, 1.0)
  ,("or not", 1, 0.0, nan, nan, nan, 0.0)
  ,("not to", 1, 0.0, nan, nan, nan, 0.0)
  ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
  ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
  ,("be and", 1, 0.0, nan, nan, nan, 0.0)
  ]
runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0, checks0)
    ,("example1", 2, example1, [])
    ,("example2", 3, example2, checks2)
    ,("example3", 2, example3, [])
    ,("example4", 4, example4, [])
    ,("example5", 5, example5, [])
    ]
    (\(name, n, ex, checks) -> do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ "  splitting: " <> if b then "PASS" else "FAIL"
    )