{-|
Module      : Gargantext.Core.Text.Learn
Description : Language and stop-word detection (learning)
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

TODO:
- generalize to ByteString
- stop words (and how to learn them)
- the main type here is String; check whether working on Text would be more efficient

-}

{-# LANGUAGE TypeSynonymInstances #-}

module Gargantext.Core.Text.Learn -- (detectLang, detectLangs, stopList)
  where

import Codec.Serialise
import qualified Data.List as DL

import Data.Map.Strict (Map, toList)
import qualified Data.Map.Strict as DM

import GHC.Generics
import Data.String (String)

import Data.Text (Text, pack, unpack, toLower)
import Data.Tuple.Extra (both)
import qualified Data.ByteString.Lazy as BSL

import Gargantext.Prelude
import Gargantext.Database.GargDB
import Gargantext.Core (Lang(..), allLangs)
import Gargantext.Core.Text.Terms.Mono (words)
import Gargantext.Core.Text.Metrics.Count (occurrencesWith)

import qualified Gargantext.Core.Text.Samples.FR as FR
import qualified Gargantext.Core.Text.Samples.EN as EN
--import qualified Gargantext.Core.Text.Samples.DE as DE
--import qualified Gargantext.Core.Text.Samples.SP as SP
--import qualified Gargantext.Core.Text.Samples.CH as CH

------------------------------------------------------------------------
data Candidate = Candidate { stop   :: Double
                           , noStop :: Double
                           } deriving (Show)

------------------------------------------------------------------------
-- * Analyze candidate
type StringSize = Int
type TotalFreq  = Int
type Freq       = Int
type Word       = String

data CatWord a = CatWord a Word
type CatProb a = Map a Double

type Events a = Map a EventBook
------------------------------------------------------------------------
data EventBook = EventBook { events_freq :: Map String     Freq
                           , events_n    :: Map StringSize TotalFreq
                           }
  deriving (Show, Generic)

instance Serialise EventBook

instance (Serialise a, Ord a) => SaveFile (Events a) where
  saveFile' f d = BSL.writeFile f (serialise d)

instance (Serialise a, Ord a) => ReadFile (Events a) where
  readFile' filepath = deserialise <$> BSL.readFile filepath

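-- Sketch of a persistence round-trip for a trained model, relying only on the
-- 'SaveFile' / 'ReadFile' instances above.  The path and the @someEvents@
-- value are placeholders; the type annotation selects the instance:
--
--   saveFile' "/tmp/lang.events" (someEvents :: Events Lang)
--   evs <- readFile' "/tmp/lang.events" :: IO (Events Lang)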
------------------------------------------------------------------------
detectStopDefault :: Text -> Maybe Bool
detectStopDefault = undefined

detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
detectBool events = detectDefault False events

detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
detectDefault = detectDefaultWith identity

detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
detectDefaultWith f d events = detectDefaultWithPriors f ps
  where
    ps = priorEventsWith f d events

detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
detectDefaultWithPriors f priors = detectCat 99 priors . f

priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
priorEventsWith f d e = toEvents d [0..2] 10 es
  where
    es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e

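-- Illustrative usage of 'detectBool' (the training pairs below are made up):
-- a small model is learnt from labelled snippets, then used to classify a new
-- one.  With representative examples the result should lean towards
-- @Just True@ for spam-like input, but the outcome depends on the training data.
--
--   let model = [ (True , pack "buy now cheap offer click here")
--               , (False, pack "meeting rescheduled to next monday") ]
--   detectBool model (pack "cheap offer, click now")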

------------------------------------------------------------------------
detectLangDefault :: Text -> Maybe Lang
detectLangDefault = detectCat 99 eventLang
  where
    eventLang :: Events Lang
    eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]

    langWord :: Lang -> CatWord Lang
    langWord l = CatWord l (textSample l)

    textSample :: Lang -> String
    textSample EN = EN.textSample
    textSample FR = FR.textSample
    textSample _  = panic "[G.C.T.L:detectLangDefault] not impl yet"
    --textSample DE = DE.textSample
    --textSample SP = SP.textSample
    --textSample CH = CH.textSample
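
-- Illustrative usage of 'detectLangDefault' (results depend on the bundled
-- FR/EN samples, so these are expectations rather than guarantees):
--
--   detectLangDefault (pack "ceci est un texte écrit en français")  -- expected: Just FR
--   detectLangDefault (pack "this text is written in english")      -- expected: Just EN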
------------------------------------------------------------------------
detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
detectCat n es = head . map fst . (detectCat' n es) . unpack
  where
    detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
    detectCat' n' es' s = DL.reverse $ DL.sortOn snd
                        $ toList
                        $ detectWith n' es' (wordsToBook [0..2] n' s)


detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
detectWith n'' el (EventBook mapFreq _) =
  DM.unionsWith (+)
  $ map DM.fromList
  $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
  $ filter (\x -> fst x /= " ")
  $ DM.toList mapFreq

-- | TODO: monoids (but proba >= 0)
toPrior :: Int -> String -> Events a -> [(a, Double)]
toPrior n'' s el = prior n'' $ pebLang s el
  where
    pebLang :: String -> Events a -> [(a, (Freq, TotalFreq))]
    pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList

    peb :: String -> EventBook -> (Freq, TotalFreq)
    peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
      where
        a = maybe 0 identity $ DM.lookup st mapFreq
        b = maybe 1 identity $ DM.lookup (length st) mapN


prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
prior i ps = zip ls $ zipWith (\x y -> x^i * y)
                              (map (\(a,_) -> part a (sum $ map fst ps')) ps')
                              (map (\(a,b) -> a / b) ps')
  where
    (ls, ps'') = DL.unzip ps
    ps'        = map (both fromIntegral) ps''

part :: (Eq p, Fractional p) => p -> p -> p
part 0 _ = 0
part _ 0 = 0
part x y = x / y

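-- A worked example of 'prior' (illustrative values): with two categories whose
-- chunk statistics are @(3,10)@ and @(1,10)@ and exponent 1,
--
--   prior 1 [(FR, (3,10)), (EN, (1,10))]
--
-- gives @[(FR, 0.75 * 0.3), (EN, 0.25 * 0.1)]@, i.e. each category's share of
-- the matching chunks, raised to the given power, times its within-category
-- frequency.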
{-
toProba :: (Eq b, Fractional b, Functor t, Foldable t)
        => t (a, b) -> t (a, b)
toProba xs = map (\(a,b) -> (a, part b total)) xs
  where
    total = sum $ map snd xs
-}
-- | TODO: monoids
toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
  where
    emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
    emptyEvent e' ns' n' = toEvent ns' n' (CatWord e' "")

    toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
    toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]

    opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
    opEvent f = DM.unionWith (op f)

------------------------------------------------------------------------

emptyEventBook :: [Int] -> Int -> EventBook
emptyEventBook ns n = wordToBook ns n " "

wordsToBook :: [Int] -> Int -> String -> EventBook
wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
  where
    ws         = map unpack $ words $ pack txt
    eventsBook = map (wordToBook ns n) ws

wordToBook :: [Int] -> Int -> Word -> EventBook
wordToBook ns n txt = EventBook ef en
  where
    chks = allChunks ns n txt
    en   = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
    ef   = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks

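-- Sketch of what 'wordToBook' builds (descriptive only; exact counts depend on
-- 'chunkAlong' and the cap @n@): for @wordToBook [0..2] 10 w@, 'events_freq'
-- counts the character chunks of length 1, 2 and 3 taken from the blank-padded
-- word @w@ (at most 10 chunks per size), while 'events_n' records, for each
-- entry of @[0..2]@, how many chunks of that size were kept.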
op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
op f (EventBook ef1 en1)
     (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
                                     (DM.unionWith f en1 en2)

------------------------------------------------------------------------
------------------------------------------------------------------------
allChunks :: [Int] -> Int -> String -> [[String]]
allChunks ns m st = map (\n -> chunks n m st) ns

-- | 'chunks' is the same function as splitBy in Context, but works on Strings
-- rather than Text (so the pack/unpack round-trips are not needed).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
           . chunkAlong (n+1) 1
           . DL.concat
           . DL.take 1000
           . DL.repeat
           . blanks

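-- Illustrative behaviour of 'chunks' (indicative only; the exact windows come
-- from 'chunkAlong'): @chunks 2 10 "abc"@ pads the word with blanks, cycles it,
-- slides a 3-character window over the result and keeps at most 10 windows
-- that are not made of spaces only, e.g. something like
-- @[" ab", "abc", "bc ", ...]@.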
-- | String preparation
blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']


{-
-- Some previous tests to be removed
--import GHC.Base (Functor)
--import Numeric.Probability.Distribution ((??))
--import qualified Numeric.Probability.Distribution as D

-- | Blocks increase the size of the word to ease computations;
-- some border and unexpected effects can happen, needs to be tested
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat

-- * Make the distributions
makeDist :: [String] -> D.T Double String
makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)

stopDist :: D.T Double String
stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList

candDist :: D.T Double String
candDist = makeDist candList

------------------------------------------------------------------------
sumProba :: Num a => D.T a String -> [Char] -> a
sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x

-- | Get the probability of an event according to a distribution
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
(~?) ds x = (==x) ?? ds

------------------------------------------------------------------------
candidate :: [Char] -> Candidate
candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)

------------------------------------------------------------------------
candList :: [String]
candList = [ "france", "alexandre", "mael", "constitution"
           , "etats-unis", "associes", "car", "train", "spam"]

--}