src/Gargantext/Text/Learn.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 TODO:
  11 - generalize to byteString
  12 - Stop words and (how to learn it).
  13 - Main type here is String check if Chars on Text would be optimized
  14
  15 -}
  16
  17 {-# LANGUAGE NoImplicitPrelude #-}
  18 {-# LANGUAGE OverloadedStrings #-}
  19 {-# LANGUAGE DeriveGeneric #-}
  20 {-# LANGUAGE TypeSynonymInstances #-}
  21 {-# LANGUAGE FlexibleInstances #-}
  22
  23 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
  24   where
  25
  26 import Codec.Serialise
  27 import qualified Data.List as DL
  28
  29 import Data.Maybe (maybe)
  30 import Data.Map.Strict (Map, toList)
  31 import qualified Data.Map.Strict as DM
  32
  33 import GHC.Generics
  34 import Data.String (String)
  35
  36 import Data.Text (Text)
  37 import Data.Text (pack, unpack, toLower)
  38 import Data.Tuple.Extra (both)
  39 import qualified Data.ByteString.Lazy as BSL
  40
  41 import Gargantext.Prelude
  42 import Gargantext.Prelude.Utils
  43 import Gargantext.Core (Lang(..), allLangs)
  44 import Gargantext.Text.Terms.Mono (words)
  45 import Gargantext.Text.Metrics.Count (occurrencesWith)
  46
  47 import qualified Gargantext.Text.Samples.FR as FR
  48 import qualified Gargantext.Text.Samples.EN as EN
  49 --import qualified Gargantext.Text.Samples.DE as DE
  50 --import qualified Gargantext.Text.Samples.SP as SP
  51 --import qualified Gargantext.Text.Samples.CH as CH
  52
  53 ------------------------------------------------------------------------
  54 data Candidate = Candidate { stop :: Double
  55                            , noStop :: Double
  56  } deriving (Show)
  57
  58 ------------------------------------------------------------------------
  59 -- * Analyze candidate
  60 type StringSize = Int
  61 type TotalFreq  = Int
  62 type Freq       = Int
  63 type Word       = String
  64
  65 data CatWord a = CatWord a Word
  66 type CatProb a = Map     a Double
  67
  68 type Events a = Map a EventBook
  69 ------------------------------------------------------------------------
  70 data EventBook = EventBook { events_freq :: Map String     Freq
  71                            , events_n    :: Map StringSize TotalFreq
  72                            }
  73                              deriving (Show, Generic)
  74
  75 instance Serialise EventBook
  76
  77 instance (Serialise a, Ord a) => SaveFile (Events a) where
  78   saveFile' f d = BSL.writeFile f (serialise d)
  79
  80 instance (Serialise a, Ord a) => ReadFile (Events a) where
  81   readFile' filepath = deserialise <$> BSL.readFile filepath
  82
  83 ------------------------------------------------------------------------
  84 detectStopDefault :: Text -> Maybe Bool
  85 detectStopDefault = undefined
  86
  87 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
  88 detectBool events = detectDefault False events
  89
  90 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
  91 detectDefault = detectDefaultWith identity
  92
  93 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
  94 detectDefaultWith f d events = detectDefaultWithPriors f ps
  95   where
  96     ps = priorEventsWith f d events
  97
  98 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
  99 detectDefaultWithPriors f priors = detectCat 99 priors . f
 100
 101 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
 102 priorEventsWith f d e = toEvents d [0..2] 10 es
 103   where
 104     es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
 105
 106
 107 ------------------------------------------------------------------------
 108 detectLangDefault :: Text -> Maybe Lang
 109 detectLangDefault = detectCat 99 eventLang
 110   where
 111     eventLang :: Events Lang
 112     eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
 113
 114     langWord :: Lang -> CatWord Lang
 115     langWord l = CatWord l (textSample l)
 116
 117     textSample :: Lang -> String
 118     textSample EN = EN.textSample
 119     textSample FR = FR.textSample
 120     textSample _  = panic "textSample: not impl yet"
 121     --textSample DE = DE.textSample
 122     --textSample SP = SP.textSample
 123     --textSample CH = CH.textSample
 124 ------------------------------------------------------------------------
 125 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
 126 detectCat n es = head . map fst . (detectCat' n es) . unpack
 127   where
 128     detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
 129     detectCat' n' es' s =  DL.reverse $ DL.sortOn snd
 130                                 $ toList
 131                                 $ detectWith n' es' (wordsToBook [0..2] n' s)
 132
 133
 134     detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
 135     detectWith n'' el (EventBook mapFreq _) =
 136       DM.unionsWith (+)
 137       $ map DM.fromList
 138       $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
 139       $ filter (\x -> fst x /= "  ")
 140       $ DM.toList mapFreq
 141
 142     -- | TODO: monoids (but proba >= 0)
 143     toPrior :: Int -> String -> Events a -> [(a, Double)]
 144     toPrior n'' s el = prior n'' $ pebLang s el
 145       where
 146         pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
 147         pebLang st = map (\(l,eb) -> (l, peb st eb)) .  DM.toList
 148
 149         peb :: String -> EventBook -> (Freq, TotalFreq)
 150         peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 151           where
 152             a = maybe 0 identity $ DM.lookup st mapFreq
 153             b = maybe 1 identity $ DM.lookup (length st) mapN
 154
 155
 156     prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
 157     prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 158                                     (map (\(a,b) -> a / b) ps')
 159       where
 160         (ls, ps'') = DL.unzip ps
 161         ps' = map (both fromIntegral) ps''
 162
 163     part :: (Eq p, Fractional p) => p -> p -> p
 164     part 0 _ = 0
 165     part _ 0 = 0
 166     part x y = x / y
 167
 168 {-
 169 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
 170                  t (a, b) -> t (a, b)
 171 toProba xs = map (\(a,b) -> (a, part b total)) xs
 172   where
 173     total = sum $ map snd xs
 174 -}
 175 -- | TODO: monoids
 176 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
 177 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
 178   where
 179     emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
 180     emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
 181
 182     toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
 183     toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
 184
 185     opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
 186     opEvent f = DM.unionWith (op f)
 187
 188 ------------------------------------------------------------------------
 189
 190 emptyEventBook :: [Int] -> Int -> EventBook
 191 emptyEventBook ns n = wordToBook ns n " "
 192
 193 wordsToBook :: [Int] -> Int -> String -> EventBook
 194 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
 195   where
 196     ws = map unpack $ words $ pack txt
 197     eventsBook = map (wordToBook ns n) ws
 198
 199 wordToBook :: [Int] -> Int -> Word -> EventBook
 200 wordToBook ns n txt = EventBook ef en
 201   where
 202     chks = allChunks ns n txt
 203     en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
 204     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 205
 206 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 207 op f (EventBook ef1 en1)
 208      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 209                                      (DM.unionWith f en1 en2)
 210
 211 ------------------------------------------------------------------------
 212 ------------------------------------------------------------------------
 213 allChunks :: [Int] -> Int -> String -> [[String]]
 214 allChunks ns m st = map (\n -> chunks n m st) ns
 215
 216 -- | Chunks is the same function as splitBy in Context but for Strings,
 217 -- not Text (without pack and unpack operations that are not needed).
 218 chunks :: Int -> Int -> String -> [String]
 219 chunks n m = DL.take m . filter (not . all (== ' '))
 220                        . chunkAlong (n+1) 1
 221                        . DL.concat
 222                        . DL.take 1000
 223                        . DL.repeat
 224                        . blanks
 225
 226 -- | String preparation
 227 blanks :: String -> String
 228 blanks [] = []
 229 blanks xs = [' '] <> xs <> [' ']
 230
 231
 232 {-
 233 -- Some previous tests to be removed
 234 --import GHC.Base (Functor)
 235 --import Numeric.Probability.Distribution ((??))
 236 --import qualified Numeric.Probability.Distribution as D
 237
 238 -- | Blocks increase the size of the word to ease computations
 239 -- some border and unexepected effects can happen, need to be tested
 240 blockOf :: Int -> String -> String
 241 blockOf n = DL.concat . DL.take n . DL.repeat
 242
 243 -- * Make the distributions
 244 makeDist :: [String] -> D.T Double String
 245 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
 246
 247 stopDist :: D.T Double String
 248 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
 249
 250 candDist :: D.T Double String
 251 candDist = makeDist candList
 252
 253 ------------------------------------------------------------------------
 254 sumProba :: Num a => D.T a String -> [Char] -> a
 255 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
 256
 257 -- | Get probability according a distribution
 258 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 259 (~?) ds x = (==x) ?? ds
 260
 261 ------------------------------------------------------------------------
 262 candidate :: [Char] -> Candidate
 263 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 264
 265 ------------------------------------------------------------------------
 266 candList :: [String]
 267 candList = [ "france", "alexandre", "mael", "constitution"
 268            , "etats-unis", "associes", "car", "train", "spam"]
 269
 270 --}