src/Gargantext/Text/Learn.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Stop words and (how to learn it).
  11
  12 Main type here is String.
  13
  14 -}
  15
  16 {-# LANGUAGE NoImplicitPrelude #-}
  17 {-# LANGUAGE OverloadedStrings #-}
  18
  19 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
  20   where
  21
  22 --import Data.Char (toLower)
  23 import qualified Data.List as DL
  24
  25 import Data.Maybe (maybe)
  26 import Data.Map.Strict (Map, toList)
  27 import qualified Data.Map.Strict as DM
  28
  29 import Data.String (String)
  30
  31 import Data.Text (Text)
  32 import Data.Text (pack, unpack, toLower)
  33 import Data.Tuple.Extra (both)
  34
  35 import Gargantext.Prelude
  36 import Gargantext.Core (Lang(..), allLangs)
  37 import Gargantext.Text.Terms.Mono (words)
  38 import Gargantext.Text.Metrics.Count (occurrencesWith)
  39
  40 import qualified Gargantext.Text.Samples.FR as FR
  41 import qualified Gargantext.Text.Samples.EN as EN
  42 --import qualified Gargantext.Text.Samples.DE as DE
  43 --import qualified Gargantext.Text.Samples.SP as SP
  44 --import qualified Gargantext.Text.Samples.CH as CH
  45
  46 ------------------------------------------------------------------------
  47 data Candidate = Candidate { stop :: Double
  48                            , noStop :: Double
  49  } deriving (Show)
  50
  51 ------------------------------------------------------------------------
  52 -- * Analyze candidate
  53 type StringSize = Int
  54 type TotalFreq  = Int
  55 type Freq       = Int
  56 type Word       = String
  57
  58 data CatWord a = CatWord a Word
  59 type CatProb a = Map     a Double
  60
  61 type Events a = Map a EventBook
  62
  63 ------------------------------------------------------------------------
  64 detectLangDefault :: Text -> Maybe Lang
  65 detectLangDefault = detectCat 99 eventLang
  66   where
  67     eventLang :: Events Lang
  68     eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
  69
  70     langWord :: Lang -> CatWord Lang
  71     langWord l = CatWord l (textSample l)
  72
  73     textSample :: Lang -> String
  74     textSample EN = EN.textSample
  75     textSample FR = FR.textSample
  76     --textSample DE = DE.textSample
  77     --textSample SP = SP.textSample
  78     --textSample CH = CH.textSample
  79
  80 detectStopDefault :: Text -> Maybe Bool
  81 detectStopDefault = undefined
  82
  83 detectDefault :: [(Bool, Text)] -> Text -> Maybe Bool
  84 detectDefault events = detectCat 99 (priorEvents events)
  85   where
  86     priorEvents events' = toEvents True [0..2] 10 (map (\(a,b) -> CatWord a (unpack $ toLower b)) events')
  87
  88 ------------------------------------------------------------------------
  89 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
  90 detectCat n es = head . map fst . (detectCat' n es) . unpack
  91   where
  92     detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
  93     detectCat' n' es' s =  DL.reverse $ DL.sortOn snd
  94                                 $ toList
  95                                 $ detectWith n' es' (wordsToBook [0..2] n' s)
  96
  97
  98     detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
  99     detectWith n'' el (EventBook mapFreq _) =
 100       DM.unionsWith (+)
 101       $ map DM.fromList
 102       $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
 103       $ filter (\x -> fst x /= "  ")
 104       $ DM.toList mapFreq
 105
 106     -- | TODO: monoids (but proba >= 0)
 107     toPrior :: Int -> String -> Events a -> [(a, Double)]
 108     toPrior n'' s el = prior n'' $ pebLang s el
 109       where
 110         pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
 111         pebLang st = map (\(l,eb) -> (l, peb st eb)) .  DM.toList
 112
 113         peb :: String -> EventBook -> (Freq, TotalFreq)
 114         peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 115           where
 116             a = maybe 0 identity $ DM.lookup st mapFreq
 117             b = maybe 1 identity $ DM.lookup (length st) mapN
 118
 119
 120     prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
 121     prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 122                                     (map (\(a,b) -> a / b) ps')
 123       where
 124         (ls, ps'') = DL.unzip ps
 125         ps' = map (both fromIntegral) ps''
 126
 127     part :: (Eq p, Fractional p) => p -> p -> p
 128     part 0 _ = 0
 129     part _ 0 = 0
 130     part x y = x / y
 131
 132 {-
 133 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
 134                  t (a, b) -> t (a, b)
 135 toProba xs = map (\(a,b) -> (a, part b total)) xs
 136   where
 137     total = sum $ map snd xs
 138 -}
 139 -- | TODO: monoids
 140 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
 141 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
 142   where
 143     emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
 144     emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
 145
 146     toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
 147     toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
 148
 149     opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
 150     opEvent f = DM.unionWith (op f)
 151
 152 ------------------------------------------------------------------------
 153 ------------------------------------------------------------------------
 154 data EventBook = EventBook { events_freq :: Map String     Freq
 155                            , events_n    :: Map StringSize TotalFreq
 156                            }
 157                              deriving (Show)
 158
 159 emptyEventBook :: [Int] -> Int -> EventBook
 160 emptyEventBook ns n = wordToBook ns n " "
 161
 162 wordsToBook :: [Int] -> Int -> String -> EventBook
 163 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
 164   where
 165     ws = map unpack $ words $ pack txt
 166     eventsBook = map (wordToBook ns n) ws
 167
 168 wordToBook :: [Int] -> Int -> Word -> EventBook
 169 wordToBook ns n txt = EventBook ef en
 170   where
 171     chks = allChunks ns n txt
 172     en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
 173     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 174
 175 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 176 op f (EventBook ef1 en1)
 177      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 178                                      (DM.unionWith f en1 en2)
 179
 180 ------------------------------------------------------------------------
 181 ------------------------------------------------------------------------
 182 allChunks :: [Int] -> Int -> String -> [[String]]
 183 allChunks ns m st = map (\n -> chunks n m st) ns
 184
 185 -- | Chunks is the same function as splitBy in Context but for Strings,
 186 -- not Text (without pack and unpack operations that are not needed).
 187 chunks :: Int -> Int -> String -> [String]
 188 chunks n m = DL.take m . filter (not . all (== ' '))
 189                        . chunkAlong (n+1) 1
 190                        . DL.concat
 191                        . DL.take 1000
 192                        . DL.repeat
 193                        . blanks
 194
 195 -- | String preparation
 196 blanks :: String -> String
 197 blanks [] = []
 198 blanks xs = [' '] <> xs <> [' ']
 199
 200
 201 {-
 202 -- Some previous tests to be removed
 203 --import GHC.Base (Functor)
 204 --import Numeric.Probability.Distribution ((??))
 205 --import qualified Numeric.Probability.Distribution as D
 206
 207 -- | Blocks increase the size of the word to ease computations
 208 -- some border and unexepected effects can happen, need to be tested
 209 blockOf :: Int -> String -> String
 210 blockOf n = DL.concat . DL.take n . DL.repeat
 211
 212 -- * Make the distributions
 213 makeDist :: [String] -> D.T Double String
 214 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
 215
 216 stopDist :: D.T Double String
 217 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
 218
 219 candDist :: D.T Double String
 220 candDist = makeDist candList
 221
 222 ------------------------------------------------------------------------
 223 sumProba :: Num a => D.T a String -> [Char] -> a
 224 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
 225
 226 -- | Get probability according a distribution
 227 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 228 (~?) ds x = (==x) ?? ds
 229
 230 ------------------------------------------------------------------------
 231 candidate :: [Char] -> Candidate
 232 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 233
 234 ------------------------------------------------------------------------
 235 candList :: [String]
 236 candList = [ "france", "alexandre", "mael", "constitution"
 237            , "etats-unis", "associes", "car", "train", "spam"]
 238
 239 --}