src/Gargantext/Core/Text/Learn.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 TODO:
  11 - generalize to byteString
  12 - Stop words and (how to learn it).
  13 - Main type here is String check if Chars on Text would be optimized
  14
  15 -}
  16
  17 {-# LANGUAGE TypeSynonymInstances #-}
  18
  19 module Gargantext.Core.Text.Learn -- (detectLang, detectLangs, stopList)
  20   where
  21
  22 import Codec.Serialise
  23 import qualified Data.List as DL
  24
  25 import Data.Map.Strict (Map, toList)
  26 import qualified Data.Map.Strict as DM
  27
  28 import GHC.Generics
  29 import Data.String (String)
  30
  31 import Data.Text (Text)
  32 import Data.Text (pack, unpack, toLower)
  33 import Data.Tuple.Extra (both)
  34 import qualified Data.ByteString.Lazy as BSL
  35
  36 import Gargantext.Prelude
  37 import Gargantext.Database.GargDB
  38 import Gargantext.Core (Lang(..), allLangs)
  39 import Gargantext.Core.Text.Terms.Mono (words)
  40 import Gargantext.Core.Text.Metrics.Count (occurrencesWith)
  41
  42 import qualified Gargantext.Core.Text.Samples.CN as CN
  43 import qualified Gargantext.Core.Text.Samples.DE as DE
  44 import qualified Gargantext.Core.Text.Samples.EN as EN
  45 import qualified Gargantext.Core.Text.Samples.ES as ES
  46 import qualified Gargantext.Core.Text.Samples.FR as FR
  47 import qualified Gargantext.Core.Text.Samples.PL as PL
  48
  49 ------------------------------------------------------------------------
  50 data Candidate = Candidate { stop :: Double
  51                            , noStop :: Double
  52  } deriving (Show)
  53
  54 ------------------------------------------------------------------------
  55 -- * Analyze candidate
  56 type StringSize = Int
  57 type TotalFreq  = Int
  58 type Freq       = Int
  59 type Word       = String
  60
  61 data CatWord a = CatWord a Word
  62 type CatProb a = Map     a Double
  63
  64 type Events a = Map a EventBook
  65 ------------------------------------------------------------------------
  66 data EventBook = EventBook { events_freq :: Map String     Freq
  67                            , events_n    :: Map StringSize TotalFreq
  68                            }
  69                              deriving (Show, Generic)
  70
  71 instance Serialise EventBook
  72
  73 instance (Serialise a, Ord a) => SaveFile (Events a) where
  74   saveFile' f d = BSL.writeFile f (serialise d)
  75
  76 instance (Serialise a, Ord a) => ReadFile (Events a) where
  77   readFile' filepath = deserialise <$> BSL.readFile filepath
  78
  79 ------------------------------------------------------------------------
  80 detectStopDefault :: Text -> Maybe Bool
  81 detectStopDefault = undefined
  82
  83 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
  84 detectBool events = detectDefault False events
  85
  86 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
  87 detectDefault = detectDefaultWith identity
  88
  89 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
  90 detectDefaultWith f d events = detectDefaultWithPriors f ps
  91   where
  92     ps = priorEventsWith f d events
  93
  94 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
  95 detectDefaultWithPriors f priors = detectCat 99 priors . f
  96
  97 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
  98 priorEventsWith f d e = toEvents d [0..2] 10 es
  99   where
 100     es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
 101
 102
 103 ------------------------------------------------------------------------
 104 detectLangDefault :: Text -> Maybe Lang
 105 detectLangDefault = detectCat 99 eventLang
 106   where
 107     eventLang :: Events Lang
 108     eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
 109
 110     langWord :: Lang -> CatWord Lang
 111     langWord l = CatWord l (textSample l)
 112
 113     textSample :: Lang -> String
 114     textSample EN = EN.textSample
 115     textSample FR = FR.textSample
 116     textSample DE = DE.textSample
 117     textSample ES = ES.textSample
 118     textSample CN = CN.textSample
 119     textSample PL = PL.textSample
 120     textSample _  = panic "[G.C.T.L:detectLangDefault] not impl yet"
 121     --textSample DE = DE.textSample
 122     --textSample SP = SP.textSample
 123     --textSample CH = CH.textSample
 124 ------------------------------------------------------------------------
 125 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
 126 detectCat n es = head . map fst . (detectCat' n es) . unpack
 127   where
 128     detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
 129     detectCat' n' es' s =  DL.reverse $ DL.sortOn snd
 130                                 $ toList
 131                                 $ detectWith n' es' (wordsToBook [0..2] n' s)
 132
 133
 134     detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
 135     detectWith n'' el (EventBook mapFreq _) =
 136       DM.unionsWith (+)
 137       $ map DM.fromList
 138       $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
 139       $ filter (\x -> fst x /= "  ")
 140       $ DM.toList mapFreq
 141
 142     -- | TODO: monoids (but proba >= 0)
 143     toPrior :: Int -> String -> Events a -> [(a, Double)]
 144     toPrior n'' s el = prior n'' $ pebLang s el
 145       where
 146         pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
 147         pebLang st = map (\(l,eb) -> (l, peb st eb)) .  DM.toList
 148
 149         peb :: String -> EventBook -> (Freq, TotalFreq)
 150         peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 151           where
 152             a = maybe 0 identity $ DM.lookup st mapFreq
 153             b = maybe 1 identity $ DM.lookup (length st) mapN
 154
 155
 156     prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
 157     prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 158                                     (map (\(a,b) -> a / b) ps')
 159       where
 160         (ls, ps'') = DL.unzip ps
 161         ps' = map (both fromIntegral) ps''
 162
 163     part :: (Eq p, Fractional p) => p -> p -> p
 164     part 0 _ = 0
 165     part _ 0 = 0
 166     part x y = x / y
 167
 168 {-
 169 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
 170                  t (a, b) -> t (a, b)
 171 toProba xs = map (\(a,b) -> (a, part b total)) xs
 172   where
 173     total = sum $ map snd xs
 174 -}
 175 -- | TODO: monoids
 176 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
 177 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
 178   where
 179     emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
 180     emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
 181
 182     toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
 183     toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
 184
 185     opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
 186     opEvent f = DM.unionWith (op f)
 187
 188 ------------------------------------------------------------------------
 189
 190 emptyEventBook :: [Int] -> Int -> EventBook
 191 emptyEventBook ns n = wordToBook ns n " "
 192
 193 wordsToBook :: [Int] -> Int -> String -> EventBook
 194 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
 195   where
 196     ws = map unpack $ words $ pack txt
 197     eventsBook = map (wordToBook ns n) ws
 198
 199 wordToBook :: [Int] -> Int -> Word -> EventBook
 200 wordToBook ns n txt = EventBook ef en
 201   where
 202     chks = allChunks ns n txt
 203     en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
 204     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 205
 206 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 207 op f (EventBook ef1 en1)
 208      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 209                                      (DM.unionWith f en1 en2)
 210
 211 ------------------------------------------------------------------------
 212 ------------------------------------------------------------------------
 213 allChunks :: [Int] -> Int -> String -> [[String]]
 214 allChunks ns m st = map (\n -> chunks n m st) ns
 215
 216 -- | Chunks is the same function as splitBy in Context but for Strings,
 217 -- not Text (without pack and unpack operations that are not needed).
 218 chunks :: Int -> Int -> String -> [String]
 219 chunks n m = DL.take m . filter (not . all (== ' '))
 220                        . chunkAlong (n+1) 1
 221                        . DL.concat
 222                        . DL.take 1000
 223                        . DL.repeat
 224                        . blanks
 225
 226 -- | String preparation
 227 blanks :: String -> String
 228 blanks [] = []
 229 blanks xs = [' '] <> xs <> [' ']
 230
 231
 232 {-
 233 -- Some previous tests to be removed
 234 --import GHC.Base (Functor)
 235 --import Numeric.Probability.Distribution ((??))
 236 --import qualified Numeric.Probability.Distribution as D
 237
 238 -- | Blocks increase the size of the word to ease computations
 239 -- some border and unexepected effects can happen, need to be tested
 240 blockOf :: Int -> String -> String
 241 blockOf n = DL.concat . DL.take n . DL.repeat
 242
 243 -- * Make the distributions
 244 makeDist :: [String] -> D.T Double String
 245 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
 246
 247 stopDist :: D.T Double String
 248 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
 249
 250 candDist :: D.T Double String
 251 candDist = makeDist candList
 252
 253 ------------------------------------------------------------------------
 254 sumProba :: Num a => D.T a String -> [Char] -> a
 255 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
 256
 257 -- | Get probability according a distribution
 258 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 259 (~?) ds x = (==x) ?? ds
 260
 261 ------------------------------------------------------------------------
 262 candidate :: [Char] -> Candidate
 263 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 264
 265 ------------------------------------------------------------------------
 266 candList :: [String]
 267 candList = [ "france", "alexandre", "mael", "constitution"
 268            , "etats-unis", "associes", "car", "train", "spam"]
 269
 270 --}