src/Gargantext/Text/Learn.hs

   1 {-|
    2 Module      : Gargantext.Text.Learn
    3 Description : Learn categories (e.g. the language) of a text from samples
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 TODO:
  11 - generalize to byteString
  12 - Stop words and (how to learn it).
   13 - Main type here is String; check whether working on Text would be more efficient
  14
  15 -}
  16
  17 {-# LANGUAGE TypeSynonymInstances #-}
  18
  19 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
  20   where
  21
  22 import Codec.Serialise
  23 import qualified Data.List as DL
  24
  25 import Data.Maybe (maybe)
  26 import Data.Map.Strict (Map, toList)
  27 import qualified Data.Map.Strict as DM
  28
  29 import GHC.Generics
  30 import Data.String (String)
  31
  32 import Data.Text (Text)
  33 import Data.Text (pack, unpack, toLower)
  34 import Data.Tuple.Extra (both)
  35 import qualified Data.ByteString.Lazy as BSL
  36
  37 import Gargantext.Prelude
  38 import Gargantext.Prelude.Utils
  39 import Gargantext.Core (Lang(..), allLangs)
  40 import Gargantext.Text.Terms.Mono (words)
  41 import Gargantext.Text.Metrics.Count (occurrencesWith)
  42
  43 import qualified Gargantext.Text.Samples.FR as FR
  44 import qualified Gargantext.Text.Samples.EN as EN
  45 --import qualified Gargantext.Text.Samples.DE as DE
  46 --import qualified Gargantext.Text.Samples.SP as SP
  47 --import qualified Gargantext.Text.Samples.CH as CH
  48
  49 ------------------------------------------------------------------------
  50 data Candidate = Candidate { stop :: Double
  51                            , noStop :: Double
  52  } deriving (Show)
  53
  54 ------------------------------------------------------------------------
  55 -- * Analyze candidate
  56 type StringSize = Int
  57 type TotalFreq  = Int
  58 type Freq       = Int
  59 type Word       = String
  60
  61 data CatWord a = CatWord a Word
  62 type CatProb a = Map     a Double
  63
  64 type Events a = Map a EventBook
  65 ------------------------------------------------------------------------
  66 data EventBook = EventBook { events_freq :: Map String     Freq
  67                            , events_n    :: Map StringSize TotalFreq
  68                            }
  69                              deriving (Show, Generic)
  70
  71 instance Serialise EventBook
  72
  73 instance (Serialise a, Ord a) => SaveFile (Events a) where
  74   saveFile' f d = BSL.writeFile f (serialise d)
  75
  76 instance (Serialise a, Ord a) => ReadFile (Events a) where
  77   readFile' filepath = deserialise <$> BSL.readFile filepath
  78
  79 ------------------------------------------------------------------------
  80 detectStopDefault :: Text -> Maybe Bool
  81 detectStopDefault = undefined
  82
  83 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
  84 detectBool events = detectDefault False events
  85
  86 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
  87 detectDefault = detectDefaultWith identity
  88
  89 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
  90 detectDefaultWith f d events = detectDefaultWithPriors f ps
  91   where
  92     ps = priorEventsWith f d events
  93
  94 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
  95 detectDefaultWithPriors f priors = detectCat 99 priors . f
  96
  97 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
  98 priorEventsWith f d e = toEvents d [0..2] 10 es
  99   where
 100     es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
 101
 102
 103 ------------------------------------------------------------------------
 104 detectLangDefault :: Text -> Maybe Lang
 105 detectLangDefault = detectCat 99 eventLang
 106   where
 107     eventLang :: Events Lang
 108     eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
 109
 110     langWord :: Lang -> CatWord Lang
 111     langWord l = CatWord l (textSample l)
 112
 113     textSample :: Lang -> String
 114     textSample EN = EN.textSample
 115     textSample FR = FR.textSample
 116     textSample _  = panic "textSample: not impl yet"
 117     --textSample DE = DE.textSample
 118     --textSample SP = SP.textSample
 119     --textSample CH = CH.textSample
 120 ------------------------------------------------------------------------
 121 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
 122 detectCat n es = head . map fst . (detectCat' n es) . unpack
 123   where
 124     detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
 125     detectCat' n' es' s =  DL.reverse $ DL.sortOn snd
 126                                 $ toList
 127                                 $ detectWith n' es' (wordsToBook [0..2] n' s)
 128
 129
 130     detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
 131     detectWith n'' el (EventBook mapFreq _) =
 132       DM.unionsWith (+)
 133       $ map DM.fromList
 134       $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
 135       $ filter (\x -> fst x /= "  ")
 136       $ DM.toList mapFreq
 137
 138     -- | TODO: monoids (but proba >= 0)
 139     toPrior :: Int -> String -> Events a -> [(a, Double)]
 140     toPrior n'' s el = prior n'' $ pebLang s el
 141       where
 142         pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
 143         pebLang st = map (\(l,eb) -> (l, peb st eb)) .  DM.toList
 144
 145         peb :: String -> EventBook -> (Freq, TotalFreq)
 146         peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 147           where
 148             a = maybe 0 identity $ DM.lookup st mapFreq
 149             b = maybe 1 identity $ DM.lookup (length st) mapN
 150
 151
 152     prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
 153     prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 154                                     (map (\(a,b) -> a / b) ps')
 155       where
 156         (ls, ps'') = DL.unzip ps
 157         ps' = map (both fromIntegral) ps''
 158
 159     part :: (Eq p, Fractional p) => p -> p -> p
 160     part 0 _ = 0
 161     part _ 0 = 0
 162     part x y = x / y
 163
 164 {-
 165 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
 166                  t (a, b) -> t (a, b)
 167 toProba xs = map (\(a,b) -> (a, part b total)) xs
 168   where
 169     total = sum $ map snd xs
 170 -}
 171 -- | TODO: monoids
 172 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
 173 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
 174   where
 175     emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
 176     emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
 177
 178     toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
 179     toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
 180
 181     opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
 182     opEvent f = DM.unionWith (op f)
 183
 184 ------------------------------------------------------------------------
 185
 186 emptyEventBook :: [Int] -> Int -> EventBook
 187 emptyEventBook ns n = wordToBook ns n " "
 188
 189 wordsToBook :: [Int] -> Int -> String -> EventBook
 190 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
 191   where
 192     ws = map unpack $ words $ pack txt
 193     eventsBook = map (wordToBook ns n) ws
 194
 195 wordToBook :: [Int] -> Int -> Word -> EventBook
 196 wordToBook ns n txt = EventBook ef en
 197   where
 198     chks = allChunks ns n txt
 199     en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
 200     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 201
 202 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 203 op f (EventBook ef1 en1)
 204      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 205                                      (DM.unionWith f en1 en2)
 206
 207 ------------------------------------------------------------------------
 208 ------------------------------------------------------------------------
 209 allChunks :: [Int] -> Int -> String -> [[String]]
 210 allChunks ns m st = map (\n -> chunks n m st) ns
 211
 212 -- | Chunks is the same function as splitBy in Context but for Strings,
 213 -- not Text (without pack and unpack operations that are not needed).
 214 chunks :: Int -> Int -> String -> [String]
 215 chunks n m = DL.take m . filter (not . all (== ' '))
 216                        . chunkAlong (n+1) 1
 217                        . DL.concat
 218                        . DL.take 1000
 219                        . DL.repeat
 220                        . blanks
 221
 222 -- | String preparation
 223 blanks :: String -> String
 224 blanks [] = []
 225 blanks xs = [' '] <> xs <> [' ']
 226
 227
 228 {-
 229 -- Some previous tests to be removed
 230 --import GHC.Base (Functor)
 231 --import Numeric.Probability.Distribution ((??))
 232 --import qualified Numeric.Probability.Distribution as D
 233
 234 -- | Blocks increase the size of the word to ease computations
  235 -- some border and unexpected effects can happen; this needs to be tested
 236 blockOf :: Int -> String -> String
 237 blockOf n = DL.concat . DL.take n . DL.repeat
 238
 239 -- * Make the distributions
 240 makeDist :: [String] -> D.T Double String
 241 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
 242
 243 stopDist :: D.T Double String
 244 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
 245
 246 candDist :: D.T Double String
 247 candDist = makeDist candList
 248
 249 ------------------------------------------------------------------------
 250 sumProba :: Num a => D.T a String -> [Char] -> a
 251 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
 252
 253 -- | Get probability according a distribution
 254 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 255 (~?) ds x = (==x) ?? ds
 256
 257 ------------------------------------------------------------------------
 258 candidate :: [Char] -> Candidate
 259 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 260
 261 ------------------------------------------------------------------------
 262 candList :: [String]
 263 candList = [ "france", "alexandre", "mael", "constitution"
 264            , "etats-unis", "associes", "car", "train", "spam"]
 265
 266 --}