src/Gargantext/Text/Learn.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 TODO:
  11 - generalize to byteString
  12
  13 Stop words and (how to learn it).
  14
  15 Main type here is String.
  16
  17 -}
  18
  19 {-# LANGUAGE NoImplicitPrelude #-}
  20 {-# LANGUAGE OverloadedStrings #-}
  21
  22 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
  23   where
  24
  25 --import Data.Char (toLower)
  26 import qualified Data.List as DL
  27
  28 import Data.Maybe (maybe)
  29 import Data.Map.Strict (Map, toList)
  30 import qualified Data.Map.Strict as DM
  31
  32 import Data.String (String)
  33
  34 import Data.Text (Text)
  35 import Data.Text (pack, unpack, toLower)
  36 import Data.Tuple.Extra (both, second)
  37
  38 import Gargantext.Prelude
  39 import Gargantext.Core (Lang(..), allLangs)
  40 import Gargantext.Text.Terms.Mono (words)
  41 import Gargantext.Text.Metrics.Count (occurrencesWith)
  42
  43 import qualified Gargantext.Text.Samples.FR as FR
  44 import qualified Gargantext.Text.Samples.EN as EN
  45 --import qualified Gargantext.Text.Samples.DE as DE
  46 --import qualified Gargantext.Text.Samples.SP as SP
  47 --import qualified Gargantext.Text.Samples.CH as CH
  48
  49 ------------------------------------------------------------------------
  50 data Candidate = Candidate { stop :: Double
  51                            , noStop :: Double
  52  } deriving (Show)
  53
  54 ------------------------------------------------------------------------
  55 -- * Analyze candidate
  56 type StringSize = Int
  57 type TotalFreq  = Int
  58 type Freq       = Int
  59 type Word       = String
  60
  61 data CatWord a = CatWord a Word
  62 type CatProb a = Map     a Double
  63
  64 type Events a = Map a EventBook
  65
  66 ------------------------------------------------------------------------
  67 detectStopDefault :: Text -> Maybe Bool
  68 detectStopDefault = undefined
  69
  70 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
  71 detectBool events = detectDefault False events
  72
  73 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
  74 detectDefault = detectDefaultWith identity
  75
  76 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
  77 detectDefaultWith f d events = detectDefaultWithPriors f ps
  78   where
  79     ps = priorEventsWith f d events
  80
  81 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
  82 detectDefaultWithPriors f priors = detectCat 99 priors . f
  83
  84 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
  85 priorEventsWith f d e = toEvents d [0..2] 10 es
  86   where
  87     es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
  88
  89
  90 ------------------------------------------------------------------------
  91 detectLangDefault :: Text -> Maybe Lang
  92 detectLangDefault = detectCat 99 eventLang
  93   where
  94     eventLang :: Events Lang
  95     eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
  96
  97     langWord :: Lang -> CatWord Lang
  98     langWord l = CatWord l (textSample l)
  99
 100     textSample :: Lang -> String
 101     textSample EN = EN.textSample
 102     textSample FR = FR.textSample
 103     --textSample DE = DE.textSample
 104     --textSample SP = SP.textSample
 105     --textSample CH = CH.textSample
 106 ------------------------------------------------------------------------
 107 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
 108 detectCat n es = head . map fst . (detectCat' n es) . unpack
 109   where
 110     detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
 111     detectCat' n' es' s =  DL.reverse $ DL.sortOn snd
 112                                 $ toList
 113                                 $ detectWith n' es' (wordsToBook [0..2] n' s)
 114
 115
 116     detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
 117     detectWith n'' el (EventBook mapFreq _) =
 118       DM.unionsWith (+)
 119       $ map DM.fromList
 120       $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
 121       $ filter (\x -> fst x /= "  ")
 122       $ DM.toList mapFreq
 123
 124     -- | TODO: monoids (but proba >= 0)
 125     toPrior :: Int -> String -> Events a -> [(a, Double)]
 126     toPrior n'' s el = prior n'' $ pebLang s el
 127       where
 128         pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
 129         pebLang st = map (\(l,eb) -> (l, peb st eb)) .  DM.toList
 130
 131         peb :: String -> EventBook -> (Freq, TotalFreq)
 132         peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 133           where
 134             a = maybe 0 identity $ DM.lookup st mapFreq
 135             b = maybe 1 identity $ DM.lookup (length st) mapN
 136
 137
 138     prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
 139     prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 140                                     (map (\(a,b) -> a / b) ps')
 141       where
 142         (ls, ps'') = DL.unzip ps
 143         ps' = map (both fromIntegral) ps''
 144
 145     part :: (Eq p, Fractional p) => p -> p -> p
 146     part 0 _ = 0
 147     part _ 0 = 0
 148     part x y = x / y
 149
 150 {-
 151 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
 152                  t (a, b) -> t (a, b)
 153 toProba xs = map (\(a,b) -> (a, part b total)) xs
 154   where
 155     total = sum $ map snd xs
 156 -}
 157 -- | TODO: monoids
 158 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
 159 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
 160   where
 161     emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
 162     emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
 163
 164     toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
 165     toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
 166
 167     opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
 168     opEvent f = DM.unionWith (op f)
 169
 170 ------------------------------------------------------------------------
 171 ------------------------------------------------------------------------
 172 data EventBook = EventBook { events_freq :: Map String     Freq
 173                            , events_n    :: Map StringSize TotalFreq
 174                            }
 175                              deriving (Show)
 176
 177 emptyEventBook :: [Int] -> Int -> EventBook
 178 emptyEventBook ns n = wordToBook ns n " "
 179
 180 wordsToBook :: [Int] -> Int -> String -> EventBook
 181 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
 182   where
 183     ws = map unpack $ words $ pack txt
 184     eventsBook = map (wordToBook ns n) ws
 185
 186 wordToBook :: [Int] -> Int -> Word -> EventBook
 187 wordToBook ns n txt = EventBook ef en
 188   where
 189     chks = allChunks ns n txt
 190     en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
 191     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 192
 193 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 194 op f (EventBook ef1 en1)
 195      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 196                                      (DM.unionWith f en1 en2)
 197
 198 ------------------------------------------------------------------------
 199 ------------------------------------------------------------------------
 200 allChunks :: [Int] -> Int -> String -> [[String]]
 201 allChunks ns m st = map (\n -> chunks n m st) ns
 202
 203 -- | Chunks is the same function as splitBy in Context but for Strings,
 204 -- not Text (without pack and unpack operations that are not needed).
 205 chunks :: Int -> Int -> String -> [String]
 206 chunks n m = DL.take m . filter (not . all (== ' '))
 207                        . chunkAlong (n+1) 1
 208                        . DL.concat
 209                        . DL.take 1000
 210                        . DL.repeat
 211                        . blanks
 212
 213 -- | String preparation
 214 blanks :: String -> String
 215 blanks [] = []
 216 blanks xs = [' '] <> xs <> [' ']
 217
 218
 219 {-
 220 -- Some previous tests to be removed
 221 --import GHC.Base (Functor)
 222 --import Numeric.Probability.Distribution ((??))
 223 --import qualified Numeric.Probability.Distribution as D
 224
 225 -- | Blocks increase the size of the word to ease computations
 226 -- some border and unexepected effects can happen, need to be tested
 227 blockOf :: Int -> String -> String
 228 blockOf n = DL.concat . DL.take n . DL.repeat
 229
 230 -- * Make the distributions
 231 makeDist :: [String] -> D.T Double String
 232 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
 233
 234 stopDist :: D.T Double String
 235 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
 236
 237 candDist :: D.T Double String
 238 candDist = makeDist candList
 239
 240 ------------------------------------------------------------------------
 241 sumProba :: Num a => D.T a String -> [Char] -> a
 242 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
 243
 244 -- | Get probability according a distribution
 245 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 246 (~?) ds x = (==x) ?? ds
 247
 248 ------------------------------------------------------------------------
 249 candidate :: [Char] -> Candidate
 250 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 251
 252 ------------------------------------------------------------------------
 253 candList :: [String]
 254 candList = [ "france", "alexandre", "mael", "constitution"
 255            , "etats-unis", "associes", "car", "train", "spam"]
 256
 257 --}