src/Gargantext/Text/Learn.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 TODO:
  11 - generalize to byteString
  12 - Stop words and (how to learn it).
  13 - Main type here is String check if Chars on Text would be optimized
  14
  15 -}
  16
  17 {-# LANGUAGE NoImplicitPrelude #-}
  18 {-# LANGUAGE OverloadedStrings #-}
  19 {-# LANGUAGE DeriveGeneric #-}
  20 {-# LANGUAGE TypeSynonymInstances #-}
  21 {-# LANGUAGE FlexibleInstances #-}
  22
  23 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
  24   where
  25
  26 import Codec.Serialise
  27 import qualified Data.List as DL
  28
  29 import Data.Maybe (maybe)
  30 import Data.Map.Strict (Map, toList)
  31 import qualified Data.Map.Strict as DM
  32
  33 import GHC.Generics
  34 import Data.String (String)
  35
  36 import Data.Text (Text)
  37 import Data.Text (pack, unpack, toLower)
  38 import Data.Tuple.Extra (both)
  39 import qualified Data.ByteString.Lazy as BSL
  40
  41 import Gargantext.Prelude
  42 import Gargantext.Prelude.Utils
  43 import Gargantext.Core (Lang(..), allLangs)
  44 import Gargantext.Text.Terms.Mono (words)
  45 import Gargantext.Text.Metrics.Count (occurrencesWith)
  46
  47 import qualified Gargantext.Text.Samples.FR as FR
  48 import qualified Gargantext.Text.Samples.EN as EN
  49 --import qualified Gargantext.Text.Samples.DE as DE
  50 --import qualified Gargantext.Text.Samples.SP as SP
  51 --import qualified Gargantext.Text.Samples.CH as CH
  52
  53 ------------------------------------------------------------------------
  54 data Candidate = Candidate { stop :: Double
  55                            , noStop :: Double
  56  } deriving (Show)
  57
  58 ------------------------------------------------------------------------
  59 -- * Analyze candidate
  60 type StringSize = Int
  61 type TotalFreq  = Int
  62 type Freq       = Int
  63 type Word       = String
  64
  65 data CatWord a = CatWord a Word
  66 type CatProb a = Map     a Double
  67
  68 type Events a = Map a EventBook
  69 ------------------------------------------------------------------------
  70 data EventBook = EventBook { events_freq :: Map String     Freq
  71                            , events_n    :: Map StringSize TotalFreq
  72                            }
  73                              deriving (Show, Generic)
  74
  75 instance Serialise EventBook
  76
  77 instance (Serialise a, Ord a) => SaveFile (Events a) where
  78   saveFile' f d = BSL.writeFile f (serialise d)
  79
  80 instance (Serialise a, Ord a) => ReadFile (Events a) where
  81   readFile' filepath = deserialise <$> BSL.readFile filepath
  82
  83 ------------------------------------------------------------------------
  84 detectStopDefault :: Text -> Maybe Bool
  85 detectStopDefault = undefined
  86
  87 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
  88 detectBool events = detectDefault False events
  89
  90 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
  91 detectDefault = detectDefaultWith identity
  92
  93 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
  94 detectDefaultWith f d events = detectDefaultWithPriors f ps
  95   where
  96     ps = priorEventsWith f d events
  97
  98 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
  99 detectDefaultWithPriors f priors = detectCat 99 priors . f
 100
 101 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
 102 priorEventsWith f d e = toEvents d [0..2] 10 es
 103   where
 104     es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
 105
 106
 107 ------------------------------------------------------------------------
 108 detectLangDefault :: Text -> Maybe Lang
 109 detectLangDefault = detectCat 99 eventLang
 110   where
 111     eventLang :: Events Lang
 112     eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
 113
 114     langWord :: Lang -> CatWord Lang
 115     langWord l = CatWord l (textSample l)
 116
 117     textSample :: Lang -> String
 118     textSample EN = EN.textSample
 119     textSample FR = FR.textSample
 120     --textSample DE = DE.textSample
 121     --textSample SP = SP.textSample
 122     --textSample CH = CH.textSample
 123 ------------------------------------------------------------------------
 124 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
 125 detectCat n es = head . map fst . (detectCat' n es) . unpack
 126   where
 127     detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
 128     detectCat' n' es' s =  DL.reverse $ DL.sortOn snd
 129                                 $ toList
 130                                 $ detectWith n' es' (wordsToBook [0..2] n' s)
 131
 132
 133     detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
 134     detectWith n'' el (EventBook mapFreq _) =
 135       DM.unionsWith (+)
 136       $ map DM.fromList
 137       $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
 138       $ filter (\x -> fst x /= "  ")
 139       $ DM.toList mapFreq
 140
 141     -- | TODO: monoids (but proba >= 0)
 142     toPrior :: Int -> String -> Events a -> [(a, Double)]
 143     toPrior n'' s el = prior n'' $ pebLang s el
 144       where
 145         pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
 146         pebLang st = map (\(l,eb) -> (l, peb st eb)) .  DM.toList
 147
 148         peb :: String -> EventBook -> (Freq, TotalFreq)
 149         peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 150           where
 151             a = maybe 0 identity $ DM.lookup st mapFreq
 152             b = maybe 1 identity $ DM.lookup (length st) mapN
 153
 154
 155     prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
 156     prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 157                                     (map (\(a,b) -> a / b) ps')
 158       where
 159         (ls, ps'') = DL.unzip ps
 160         ps' = map (both fromIntegral) ps''
 161
 162     part :: (Eq p, Fractional p) => p -> p -> p
 163     part 0 _ = 0
 164     part _ 0 = 0
 165     part x y = x / y
 166
 167 {-
 168 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
 169                  t (a, b) -> t (a, b)
 170 toProba xs = map (\(a,b) -> (a, part b total)) xs
 171   where
 172     total = sum $ map snd xs
 173 -}
 174 -- | TODO: monoids
 175 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
 176 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
 177   where
 178     emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
 179     emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
 180
 181     toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
 182     toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
 183
 184     opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
 185     opEvent f = DM.unionWith (op f)
 186
 187 ------------------------------------------------------------------------
 188
 189 emptyEventBook :: [Int] -> Int -> EventBook
 190 emptyEventBook ns n = wordToBook ns n " "
 191
 192 wordsToBook :: [Int] -> Int -> String -> EventBook
 193 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
 194   where
 195     ws = map unpack $ words $ pack txt
 196     eventsBook = map (wordToBook ns n) ws
 197
 198 wordToBook :: [Int] -> Int -> Word -> EventBook
 199 wordToBook ns n txt = EventBook ef en
 200   where
 201     chks = allChunks ns n txt
 202     en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
 203     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 204
 205 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 206 op f (EventBook ef1 en1)
 207      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 208                                      (DM.unionWith f en1 en2)
 209
 210 ------------------------------------------------------------------------
 211 ------------------------------------------------------------------------
 212 allChunks :: [Int] -> Int -> String -> [[String]]
 213 allChunks ns m st = map (\n -> chunks n m st) ns
 214
 215 -- | Chunks is the same function as splitBy in Context but for Strings,
 216 -- not Text (without pack and unpack operations that are not needed).
 217 chunks :: Int -> Int -> String -> [String]
 218 chunks n m = DL.take m . filter (not . all (== ' '))
 219                        . chunkAlong (n+1) 1
 220                        . DL.concat
 221                        . DL.take 1000
 222                        . DL.repeat
 223                        . blanks
 224
 225 -- | String preparation
 226 blanks :: String -> String
 227 blanks [] = []
 228 blanks xs = [' '] <> xs <> [' ']
 229
 230
 231 {-
 232 -- Some previous tests to be removed
 233 --import GHC.Base (Functor)
 234 --import Numeric.Probability.Distribution ((??))
 235 --import qualified Numeric.Probability.Distribution as D
 236
 237 -- | Blocks increase the size of the word to ease computations
 238 -- some border and unexepected effects can happen, need to be tested
 239 blockOf :: Int -> String -> String
 240 blockOf n = DL.concat . DL.take n . DL.repeat
 241
 242 -- * Make the distributions
 243 makeDist :: [String] -> D.T Double String
 244 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
 245
 246 stopDist :: D.T Double String
 247 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
 248
 249 candDist :: D.T Double String
 250 candDist = makeDist candList
 251
 252 ------------------------------------------------------------------------
 253 sumProba :: Num a => D.T a String -> [Char] -> a
 254 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
 255
 256 -- | Get probability according a distribution
 257 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 258 (~?) ds x = (==x) ?? ds
 259
 260 ------------------------------------------------------------------------
 261 candidate :: [Char] -> Candidate
 262 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 263
 264 ------------------------------------------------------------------------
 265 candList :: [String]
 266 candList = [ "france", "alexandre", "mael", "constitution"
 267            , "etats-unis", "associes", "car", "train", "spam"]
 268
 269 --}