{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervized Word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

# Implementation of Unsupervized Word Segmentation

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese, Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
Notes for current implementation:

- TODO fix normalization
- TODO extract longer ngrams (see paper above, viterbi algo can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  from Gargantext.Text.Terms import extractTermsUnsupervised
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                        $ catMaybes
                        $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}

module Gargantext.Text.Eleve where
import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import qualified Data.List as L
import Data.Map (Map)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

type Entropy e =
  ( Fractional e
  , Floating   e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy      :: e
  , _info_norm_entropy :: e
  }

instance Show e => Show (I e) where
  show (I e n) = show (e, n)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e)
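-- A hedged usage sketch: 'setNormEntropy' is the 'ModEntropy' passed to
-- 'normalizeEntropy' below; it pairs a raw entropy with its normalized image.
--
-- >>> setNormEntropy (/ 2) (3.0 :: Double)
-- (3.0,1.5)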
data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)
isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

toToken :: Int -> [Text] -> [Token]
toToken n xs = Terminal Start : (NonTerminal <$> xs) <> L.take n (repeat $ Terminal Stop)

unToken :: [Token] -> [Text]
unToken = map f
  where
    f (NonTerminal x) = x
    f (Terminal    _) = ""
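-- Hedged examples of the token round-trip: 'toToken' pads with n Stop
-- sentinels so n-gram windows can see the sentence end, and 'unToken'
-- erases sentinels to empty strings.
--
-- >>> toToken 2 ["new", "york"]
-- [Terminal Start,NonTerminal "new",NonTerminal "york",Terminal Stop,Terminal Stop]
--
-- >>> unToken (toToken 2 ["new", "york"])
-- ["","new","york","",""]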
------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count    :: Int }
  deriving (Show)

makeLenses ''Trie
insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c + 1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie
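-- A small worked example (illustrative): inserting two 2-grams that share
-- their first element yields a root of count 2 with one shared branch:
--
--   insertTries [["a","b"], ["a","c"]]
--     ~ Node 2 () (Map.fromList
--         [("a", Node 2 () (Map.fromList [("b", Leaf 1), ("c", Leaf 1)]))])
--
-- The count of a node records how many inserted sequences pass through it.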
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children
-----------------------------

-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
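-- Usage sketch: a trie can be pretty-printed by converting it to a rose
-- tree, e.g.
--   P.putStrLn . Tree.drawTree . fmap show . toTree (NonTerminal "")
-- (this is how 'testEleve' below renders the normalized trie).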
------------------------------------------------------------------------
------------------------------------------------------------------------

nan :: Floating e => e
nan = 0 / 0

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e
entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
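-- Worked example (illustrative): at a node of count 4 whose non-terminal
-- children have counts 2, 1 and 1, the branching entropy is
--   - (2/4) * logBase 2 (2/4) - (1/4) * logBase 2 (1/4) - (1/4) * logBase 2 (1/4)
--   = 0.5 + 0.5 + 0.5 = 1.5 bits.
-- A Terminal child contributes `chc * logBase 2 c` (positively) instead,
-- rewarding positions frequently followed by a sentence boundary.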
------------------------------------------------------------------------
normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . filter (not . P.isNaN))
  where
    -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
    checkDiff = identity
    -- trace "normalizeLevel"
    go [] = panic "normalizeLevel: impossible"
    go es = \e -> (e - m) / v
    {- debugging variant kept for reference:
         let e' = (e - m) / v
         in if P.isNaN e'
              then trace ("normalizeLevel " <> show (e,m,v,es)) e
              else e'
    -}
      where
        m = mean      es
        v = deviation es
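-- Worked example (illustrative, assuming `mean` and `deviation` from
-- Gargantext.Prelude are the arithmetic mean and standard deviation): for
-- sibling entropies [1.0, 2.0, 3.0], m = 2.0 and v ≈ 0.816, so 3.0 is
-- mapped to (3.0 - 2.0) / 0.816 ≈ 1.22, i.e. a z-score within its level.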
nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty
------------------------------------------------------------------------

class IsTrie trie where
  buildTrie   :: Floating e => [[Token]] -> trie Token e
  nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
  nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
  findTrie    :: Ord k => [k] -> trie k e -> trie k e
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

  nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
  nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
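-- Usage sketch: with a trie `t` built from a corpus,
--   nodeAutonomy identity t (NonTerminal <$> ["new", "york"])
-- walks the path new → york and reads that node's entropy through the
-- supplied getter (here `identity`, i.e. the raw entropy).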
instance IsTrie Trie where
  buildTrie = entropyTrie isTerminal . insertTries

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
                                 nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
    where
      go _ []         _        = panic "normalizeEntropy' empty levels"
      go _ _          (Leaf c) = Leaf c
      go _ ([] : _)   _        = panic "normalizeEntropy': empty level"
      go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
{-
  This only normalizes a node with respect to its siblings (rather than all
  the nodes of the same level).

  normalizeEntropy inE modE = go $ modE identity
    where
      go _ (Leaf c) = Leaf c
      go f (Node c i children)
        | Map.null children =
            panic "normalizeEntropy: impossible"
        | otherwise =
            Node c (f i) $ go (modE $ normalizeLevel es) <$> children
        where
          es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
241 ------------------------------------------------------------------------
243 levels :: Trie k e -> [[Trie k e]]
244 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
246 subForest :: Trie k e -> [Trie k e]
247 subForest (Leaf _) = []
248 subForest (Node _ _ children) = Map.elems children
250 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
251 entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
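-- For instance (sketch): on the small trie from the 'insertTries' example
-- above, 'levels' yields [[root], [node "a"], [leaf "b", leaf "c"]], and
-- 'entropyLevels identity' keeps, per depth, only the defined (non-NaN)
-- entropies — exactly what the level-wise normalization above consumes.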
------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

instance IsTrie Tries where
  buildTrie tts = Tries { _fwd = buildTrie tts
                        , _bwd = buildTrie (reverse <$> tts)
                        }

  nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
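-- Design note: the backward trie indexes every sequence reversed, so it
-- measures the branching entropy of left contexts as well; 'nodeEntropy' on
-- 'Tries' then averages the forward and backward values, in the spirit of
-- the EleVe reference implementation linked in the header.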
------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  []                   = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0)             = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
        -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
        if acc
          then go xt (x:pref) xs
          else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
        -- ^ entropy of the current prefix
        ext0 = ne 0 xt0
        -- ^ entropy of [x]
        ext  = ne 0 xt
        -- ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

        ne d t = if P.isNaN e then d else e
          where e = nodeEntropy inE t
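-- A walk-through of the accumulation test (illustrative): scanning tokens
-- left to right, `x` is appended to the current prefix only when
-- aut(prefix ++ [x]) > aut(prefix) + aut([x]); otherwise the prefix is
-- emitted and the scan restarts at `x` from the root `t0`. On example0
-- below, "New" should absorb "York" (they always co-occur), while "York"
-- should not absorb "is".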
{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------
------------------------------------------------------------------------
mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken (n - 1) <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- NP: here we use the raw entropy to split;
    -- instead we should use either
    --   info_norm_entropy or info_norm_entropy'.
    -- However they should first be fixed.
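-- Usage sketch (hedged): on the tokenized sentence of example0 one would
-- expect
--   mainEleve 2 [["New", "York", "is", "New", "York", "and", "New", "York"]]
-- to group "New York" together, roughly
--   [[["New","York"],["is"],["New","York"],["and"],["New","York"]]]
-- though the exact output depends on the normalization marked TODO above.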
testEleve :: Bool -> Int -> [Text] -> IO Bool
testEleve debug n output = do
  let
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken (n - 1) <$> input
    t        = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- nt = normalizeEntropy  identity setNormEntropy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    nt = normalizeEntropy identity setNormEntropy
                          (t :: Trie Token Double)
{-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m  <- [1 .. n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c  <- cs
                          ]
          ]
-}
    --res = map unToken . split identity fwd <$> inp
    --res = map unToken . split info_norm_entropy' nt' <$> inp
    res = map unToken . split info_norm_entropy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- mapM_ (P.putStrLn . show) pss
    printTrie nt
    P.putStrLn $ show res
  pure $ expected == res
  where
    printTrie :: Show e => Trie Token e -> IO ()
    printTrie =
      P.putStrLn . Tree.drawTree
                 . fmap show
                 . toTree (NonTerminal "")
-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
       -- > TEST: should not have "York New" in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]
runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0)
    ,("example1", 2, example1)
    ,("example2", 3, example2)
    ,("example3", 2, example3)
    ,("example4", 4, example4)
    ,("example5", 5, example5)
    ]
    (\(name, n, ex) -> do
      b <- testEleve False n ex
      P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
    )