src/Gargantext/Text/Learn.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 TODO:
  11 - generalize to byteString
  12
  13 Stop words and (how to learn it).
  14
  15 Main type here is String.
  16
  17 -}
  18
  19 {-# LANGUAGE NoImplicitPrelude #-}
  20 {-# LANGUAGE OverloadedStrings #-}
  21 {-# LANGUAGE DeriveGeneric #-}
  22 {-# LANGUAGE TypeSynonymInstances #-}
  23 {-# LANGUAGE FlexibleInstances #-}
  24
  25 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
  26   where
  27
  28 import Codec.Serialise
  29 import qualified Data.List as DL
  30
  31 import Data.Maybe (maybe)
  32 import Data.Map.Strict (Map, toList)
  33 import qualified Data.Map.Strict as DM
  34
  35 import GHC.Generics
  36 import Data.String (String)
  37
  38 import Data.Text (Text)
  39 import Data.Text (pack, unpack, toLower)
  40 import Data.Tuple.Extra (both)
  41 import qualified Data.ByteString.Lazy as BSL
  42
  43 import Gargantext.Prelude
  44 import Gargantext.Prelude.Utils
  45 import Gargantext.Core (Lang(..), allLangs)
  46 import Gargantext.Text.Terms.Mono (words)
  47 import Gargantext.Text.Metrics.Count (occurrencesWith)
  48
  49 import qualified Gargantext.Text.Samples.FR as FR
  50 import qualified Gargantext.Text.Samples.EN as EN
  51 --import qualified Gargantext.Text.Samples.DE as DE
  52 --import qualified Gargantext.Text.Samples.SP as SP
  53 --import qualified Gargantext.Text.Samples.CH as CH
  54
  55 ------------------------------------------------------------------------
  56 data Candidate = Candidate { stop :: Double
  57                            , noStop :: Double
  58  } deriving (Show)
  59
  60 ------------------------------------------------------------------------
  61 -- * Analyze candidate
  62 type StringSize = Int
  63 type TotalFreq  = Int
  64 type Freq       = Int
  65 type Word       = String
  66
  67 data CatWord a = CatWord a Word
  68 type CatProb a = Map     a Double
  69
  70 type Events a = Map a EventBook
  71
  72 ------------------------------------------------------------------------
  73 detectStopDefault :: Text -> Maybe Bool
  74 detectStopDefault = undefined
  75
  76 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
  77 detectBool events = detectDefault False events
  78
  79 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
  80 detectDefault = detectDefaultWith identity
  81
  82 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
  83 detectDefaultWith f d events = detectDefaultWithPriors f ps
  84   where
  85     ps = priorEventsWith f d events
  86
  87 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
  88 detectDefaultWithPriors f priors = detectCat 99 priors . f
  89
  90 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
  91 priorEventsWith f d e = toEvents d [0..2] 10 es
  92   where
  93     es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
  94
  95
  96 ------------------------------------------------------------------------
  97 detectLangDefault :: Text -> Maybe Lang
  98 detectLangDefault = detectCat 99 eventLang
  99   where
 100     eventLang :: Events Lang
 101     eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
 102
 103     langWord :: Lang -> CatWord Lang
 104     langWord l = CatWord l (textSample l)
 105
 106     textSample :: Lang -> String
 107     textSample EN = EN.textSample
 108     textSample FR = FR.textSample
 109     --textSample DE = DE.textSample
 110     --textSample SP = SP.textSample
 111     --textSample CH = CH.textSample
 112 ------------------------------------------------------------------------
 113 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
 114 detectCat n es = head . map fst . (detectCat' n es) . unpack
 115   where
 116     detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
 117     detectCat' n' es' s =  DL.reverse $ DL.sortOn snd
 118                                 $ toList
 119                                 $ detectWith n' es' (wordsToBook [0..2] n' s)
 120
 121
 122     detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
 123     detectWith n'' el (EventBook mapFreq _) =
 124       DM.unionsWith (+)
 125       $ map DM.fromList
 126       $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
 127       $ filter (\x -> fst x /= "  ")
 128       $ DM.toList mapFreq
 129
 130     -- | TODO: monoids (but proba >= 0)
 131     toPrior :: Int -> String -> Events a -> [(a, Double)]
 132     toPrior n'' s el = prior n'' $ pebLang s el
 133       where
 134         pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
 135         pebLang st = map (\(l,eb) -> (l, peb st eb)) .  DM.toList
 136
 137         peb :: String -> EventBook -> (Freq, TotalFreq)
 138         peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 139           where
 140             a = maybe 0 identity $ DM.lookup st mapFreq
 141             b = maybe 1 identity $ DM.lookup (length st) mapN
 142
 143
 144     prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
 145     prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 146                                     (map (\(a,b) -> a / b) ps')
 147       where
 148         (ls, ps'') = DL.unzip ps
 149         ps' = map (both fromIntegral) ps''
 150
 151     part :: (Eq p, Fractional p) => p -> p -> p
 152     part 0 _ = 0
 153     part _ 0 = 0
 154     part x y = x / y
 155
 156 {-
 157 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
 158                  t (a, b) -> t (a, b)
 159 toProba xs = map (\(a,b) -> (a, part b total)) xs
 160   where
 161     total = sum $ map snd xs
 162 -}
 163 -- | TODO: monoids
 164 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
 165 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
 166   where
 167     emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
 168     emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
 169
 170     toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
 171     toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
 172
 173     opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
 174     opEvent f = DM.unionWith (op f)
 175
 176 ------------------------------------------------------------------------
 177 ------------------------------------------------------------------------
 178 data EventBook = EventBook { events_freq :: Map String     Freq
 179                            , events_n    :: Map StringSize TotalFreq
 180                            }
 181                              deriving (Show, Generic)
 182
 183 instance Serialise EventBook
 184
 185 instance (Serialise a, Ord a) => SaveFile (Events a) where
 186   saveFile' f d = BSL.writeFile f (serialise d)
 187
 188 instance (Serialise a, Ord a) => ReadFile (Events a) where
 189   readFile' filepath = deserialise <$> BSL.readFile filepath
 190
 191
 192 emptyEventBook :: [Int] -> Int -> EventBook
 193 emptyEventBook ns n = wordToBook ns n " "
 194
 195 wordsToBook :: [Int] -> Int -> String -> EventBook
 196 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
 197   where
 198     ws = map unpack $ words $ pack txt
 199     eventsBook = map (wordToBook ns n) ws
 200
 201 wordToBook :: [Int] -> Int -> Word -> EventBook
 202 wordToBook ns n txt = EventBook ef en
 203   where
 204     chks = allChunks ns n txt
 205     en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
 206     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 207
 208 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 209 op f (EventBook ef1 en1)
 210      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 211                                      (DM.unionWith f en1 en2)
 212
 213 ------------------------------------------------------------------------
 214 ------------------------------------------------------------------------
 215 allChunks :: [Int] -> Int -> String -> [[String]]
 216 allChunks ns m st = map (\n -> chunks n m st) ns
 217
 218 -- | Chunks is the same function as splitBy in Context but for Strings,
 219 -- not Text (without pack and unpack operations that are not needed).
 220 chunks :: Int -> Int -> String -> [String]
 221 chunks n m = DL.take m . filter (not . all (== ' '))
 222                        . chunkAlong (n+1) 1
 223                        . DL.concat
 224                        . DL.take 1000
 225                        . DL.repeat
 226                        . blanks
 227
 228 -- | String preparation
 229 blanks :: String -> String
 230 blanks [] = []
 231 blanks xs = [' '] <> xs <> [' ']
 232
 233
 234 {-
 235 -- Some previous tests to be removed
 236 --import GHC.Base (Functor)
 237 --import Numeric.Probability.Distribution ((??))
 238 --import qualified Numeric.Probability.Distribution as D
 239
 240 -- | Blocks increase the size of the word to ease computations
 241 -- some border and unexepected effects can happen, need to be tested
 242 blockOf :: Int -> String -> String
 243 blockOf n = DL.concat . DL.take n . DL.repeat
 244
 245 -- * Make the distributions
 246 makeDist :: [String] -> D.T Double String
 247 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
 248
 249 stopDist :: D.T Double String
 250 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
 251
 252 candDist :: D.T Double String
 253 candDist = makeDist candList
 254
 255 ------------------------------------------------------------------------
 256 sumProba :: Num a => D.T a String -> [Char] -> a
 257 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
 258
 259 -- | Get probability according a distribution
 260 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 261 (~?) ds x = (==x) ?? ds
 262
 263 ------------------------------------------------------------------------
 264 candidate :: [Char] -> Candidate
 265 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 266
 267 ------------------------------------------------------------------------
 268 candList :: [String]
 269 candList = [ "france", "alexandre", "mael", "constitution"
 270            , "etats-unis", "associes", "car", "train", "spam"]
 271
 272 --}