{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervized Word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

# Implementation of Unsupervized Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics
  , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for current implementation:
- TODO fix normalization
- TODO extract longer ngrams (see paper above, viterbi algo can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  from Gargantext.Text.Terms import extractTermsUnsupervised
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                        $ catMaybes
                        $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}

module Gargantext.Text.Eleve where

import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy       :: e
  , _info_norm_entropy  :: e
  }

instance Show e => Show (I e) where
  show (I e n) = show (e, n)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e)

data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

toToken :: Int -> [Text] -> [Token]
toToken n xs = Terminal Start : (NonTerminal <$> xs) <> L.take n (repeat $ Terminal Stop)

unToken :: [Token] -> [Text]
unToken = map f
  where
    f (NonTerminal x) = x
    f (Terminal _)    = ""

------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
 | Leaf { _node_count    :: Int }
  deriving (Show)

makeLenses ''Trie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n +1}
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie  = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

                        -----------------------------

-- | Trie to Tree since Tree as nice print function
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------

nan :: Floating e => e
nan = 0 / 0

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
------------------------------------------------------------------------

normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . filter (not . P.isNaN))

  where
    -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
    checkDiff = identity
    go []  = panic "normalizeLevel: impossible"
                        -- trace "normalizeLevel"
--    go [_] = identity
    go es  = \e -> (e - m) / v
{-
                              in if P.isNaN e'
                                  then trace ("normalizeLevel " <> show (e,m,v,es))
                                      e
                                  else e'
-}
      where
        m  = mean      es
        v  = deviation es

{- Unused

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

-}

class IsTrie trie where
  buildTrie :: Floating e => [[Token]] -> trie Token e
  nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
  nodeChild :: Ord k => k -> trie k e -> trie k e
  findTrie :: Ord k => [k] -> trie k e -> trie k e
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

  nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
  nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t

instance IsTrie Trie where
  buildTrie = entropyTrie isTerminal . insertTries

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
                                 nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
    where
      go _ []         _                   = panic "normalizeEntropy' empty levels"
      go _ _          (Leaf c)            = Leaf c
      go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
      go f (es : ess) (Node c i children) =
          Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children


  {-
  This is only normalizing a node with respect to its brothers (unlike all the
  nodes of the same level).

  normalizeEntropy inE modE = go $ modE identity
    where
      go _ (Leaf c) = Leaf c
      go f (Node c i children)
        | Map.null children =
            panic "normalizeEntropy: impossible"
        | otherwise         =
            Node c (f i) $ go (modE $ normalizeLevel es) <$> children
          where
            es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
  -}
------------------------------------------------------------------------

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
  where
    subForest :: Trie k e -> [Trie k e]
    subForest (Leaf _)            = []
    subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels

------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

instance IsTrie Tries where
  buildTrie tts = Tries { _fwd = buildTrie tts
                        , _bwd = buildTrie (reverse <$> tts)
                        }

  nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)

------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  [] = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
        -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
        if acc
          then go xt (x:pref) xs
          else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
    --  ^ entropy of the current prefix
        ext0 = ne 0 xt0
    --  ^ entropy of [x]
        ext  = ne 0 xt
    --  ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

    ne d t = if P.isNaN e then d else e
      where e = nodeEntropy inE t

{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------
------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve _ _ = []
{-
mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken (n - 1) <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
                          -- NP: here we use the entropy to split
                          -- instead we should use either:
                          --   info_norm_entropy or info_norm_entropy'
                          -- However they should first be fixed.
-}

testEleve :: Bool -> Int -> [Text] -> IO Bool
testEleve debug n output = do
  let
    out = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input = (T.splitOn "-" =<<) <$> out
    inp = toToken (n - 1) <$> input
    t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- nt = normalizeEntropy  identity setNormEntropy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    nt = normalizeEntropy identity setNormEntropy
                          (t :: Trie Token Double)
    {-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m <- [1..n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c <- cs
                          ]
          ]
    -}
  --res = map unToken . split identity fwd <$> inp
  --res = map unToken . split info_norm_entropy' nt' <$> inp
    res = map unToken . split info_norm_entropy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- mapM_ (P.putStrLn . show) pss
    P.putStrLn ""
    printTrie nt
{-
    printTrie (_fwd nt)
    printTrie (_bwd nt)
    -}
    P.putStrLn $ show res
  pure $ expected == res

  where
    printTrie =
      P.putStrLn . Tree.drawTree
                 . fmap show
                 . toTree (NonTerminal "")

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 =  ["New-York is New-York and New-York"]
example1 =  ["to-be or not to-be"]
example2 =  ["to-be-or not to-be-or NOT to-be and"]
example3 =  example0 <> example0
       -- > TEST: Should not have York New in the trie
example4 =  ["a-b-c-d e a-b-c-d f"]
example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 =  ["le-petit chat"
            ,"le-petit chien"
            ,"le-petit rat"
            ,"le gros rat"
            ]

runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0)
    ,("example1", 2, example1)
    ,("example2", 3, example2)
    ,("example3", 2, example3)
    ,("example4", 4, example4)
    ,("example5", 5, example5)
    ]
    (\(name, n, ex) -> do
      b <- testEleve False n ex
      P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
    )