]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Learn.hs
[scrapers] fix limit with MAX_DOCS_SCRAPERS
[gargantext.git] / src / Gargantext / Core / Text / Learn.hs
1 {-|
2 Module : Gargantext.Core.Text.Learn
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 TODO:
11 - generalize to byteString
12 - Stop words and (how to learn it).
13 - Main type here is String check if Chars on Text would be optimized
14
15 -}
16
17 {-# LANGUAGE TypeSynonymInstances #-}
18
19 module Gargantext.Core.Text.Learn -- (detectLang, detectLangs, stopList)
20 where
21
22 import Codec.Serialise
23 import qualified Data.List as DL
24
25 import Data.Map.Strict (Map, toList)
26 import qualified Data.Map.Strict as DM
27
28 import GHC.Generics
29 import Data.String (String)
30
31 import Data.Text (Text)
32 import Data.Text (pack, unpack, toLower)
33 import Data.Tuple.Extra (both)
34 import qualified Data.ByteString.Lazy as BSL
35
36 import Gargantext.Prelude
37 import Gargantext.Database.GargDB
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Core.Text.Terms.Mono (words)
40 import Gargantext.Core.Text.Metrics.Count (occurrencesWith)
41
42 import qualified Gargantext.Core.Text.Samples.FR as FR
43 import qualified Gargantext.Core.Text.Samples.EN as EN
44 --import qualified Gargantext.Core.Text.Samples.DE as DE
45 --import qualified Gargantext.Core.Text.Samples.SP as SP
46 --import qualified Gargantext.Core.Text.Samples.CH as CH
47
48 ------------------------------------------------------------------------
-- | Pair of scores attached to a candidate word.  Its only use in this file
-- is the retired experiment at the end of the module, where 'stop' is the
-- score under the stop-word distribution and 'noStop' the score under the
-- candidate distribution.
data Candidate = Candidate
  { stop   :: Double
  , noStop :: Double
  }
  deriving (Show)
52
53 ------------------------------------------------------------------------
54 -- * Analyze candidate
-- | Key of the 'events_n' table of an 'EventBook'.
type StringSize = Int

-- | Number of chunks recorded under one 'StringSize' key.
type TotalFreq = Int

-- | Number of occurrences of one chunk.
type Freq = Int

-- | A piece of text handled as a plain 'String'.
type Word = String

-- | A text labelled with the category @a@ it is an example of.
data CatWord a = CatWord a Word

-- | Score of each category.
type CatProb a = Map a Double

-- | One 'EventBook' per category.
type Events a = Map a EventBook
64 ------------------------------------------------------------------------
-- | Chunk statistics gathered from some text.
data EventBook = EventBook
  { events_freq :: Map String Freq
    -- ^ occurrences of every chunk that was kept
  , events_n    :: Map StringSize TotalFreq
    -- ^ number of chunks kept, keyed by the @n@ values given to 'wordToBook'
  }
  deriving (Show, Generic)

-- | Encoding derived from the 'Generic' representation.
instance Serialise EventBook
71
-- | Write an 'Events' table to disk as the bytes produced by 'serialise'.
instance (Serialise a, Ord a) => SaveFile (Events a) where
  saveFile' f = BSL.writeFile f . serialise
74
-- | Load an 'Events' table by running 'deserialise' on the lazily read
-- contents of the file.
instance (Serialise a, Ord a) => ReadFile (Events a) where
  readFile' = (deserialise <$>) . BSL.readFile
77
78 ------------------------------------------------------------------------
-- | Default stop-word detector.
--
-- Not implemented yet.  Forcing it aborts with a message that names this
-- function (same convention as the "not impl yet" panic used for missing
-- language samples) instead of the anonymous failure of 'undefined'.
detectStopDefault :: Text -> Maybe Bool
detectStopDefault = panic "[G.C.T.L:detectStopDefault] not impl yet"
81
-- | Classify a text as 'True' or 'False' from labelled examples.
-- 'False' is the category handed to 'detectDefault' as the default one.
detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
detectBool = detectDefault False
84
-- | 'detectDefaultWith' specialised to examples that already are 'Text'
-- (the projection is 'identity').
detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
detectDefault d events = detectDefaultWith identity d events
87
-- | Build the category table from labelled examples with 'priorEventsWith',
-- then classify with 'detectDefaultWithPriors'.
--
-- * @f@      projects an example (or a query) to its 'Text'
-- * @d@      default category, always present in the table
-- * @events@ labelled examples
detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
detectDefaultWith f d events =
  detectDefaultWithPriors f (priorEventsWith f d events)
92
-- | Classify a value against an already built table: the value is projected
-- to 'Text' with @f@ and given to 'detectCat' with the fixed parameter 99.
detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
detectDefaultWithPriors f priors x = detectCat 99 priors (f x)
95
-- | Turn labelled examples into an 'Events' table.
--
-- Each example is projected with @f@, lower-cased and unpacked to a
-- 'String'; the table is built by 'toEvents' with sizes @[0..2]@ and a limit
-- of 10, and @d@ as the default category.
priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
priorEventsWith f d e =
  toEvents d [0..2] 10
    [ CatWord cat (unpack (toLower (f x))) | (cat, x) <- e ]
100
101
102 ------------------------------------------------------------------------
-- | Guess the language of a text from the samples bundled with the code.
--
-- Only languages that have a sample contribute a category.  A language listed
-- in 'allLangs' without a sample is skipped; it used to abort the whole
-- detection with a panic as soon as the table was built.
detectLangDefault :: Text -> Maybe Lang
detectLangDefault = detectCat 99 eventLang
  where
    -- Table built once from every available sample; FR is the default
    -- category handed to 'toEvents'.
    eventLang :: Events Lang
    eventLang = toEvents FR [0..2] 10 langWords

    -- A failed 'Just' match in the generator drops the language.
    langWords :: [CatWord Lang]
    langWords = [ CatWord l s | l <- allLangs, Just s <- [textSample l] ]

    -- 'Nothing' for languages whose sample is not available yet.
    textSample :: Lang -> Maybe String
    textSample EN = Just EN.textSample
    textSample FR = Just FR.textSample
    textSample _  = Nothing
    --textSample DE = Just DE.textSample
    --textSample SP = Just SP.textSample
    --textSample CH = Just CH.textSample
119 ------------------------------------------------------------------------
-- | Best-scoring category of a text.
--
-- The text is turned into an 'EventBook' with sizes @[0..2]@, scored against
-- every category by 'detectWith', and the categories are ordered by
-- descending score (ascending sort, then reversed).  The first one is
-- returned through the prelude's 'head'.
--
-- @n@ is used twice: as the limit given to 'wordsToBook' and as the parameter
-- given to 'detectWith'.
detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
detectCat n es txt = head [ cat | (cat, _) <- ranked ]
  where
    book   = wordsToBook [0..2] n (unpack txt)
    ranked = DL.reverse (DL.sortOn snd (toList (detectWith n es book)))
127
128
-- | Score every category of the table against the chunks of an 'EventBook'.
--
-- For each chunk (the key @" "@ is ignored) 'toPrior' gives one value per
-- category; that value is multiplied by the number of occurrences of the
-- chunk, and the contributions of all chunks are added per category.
detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
detectWith expo table (EventBook chunkFreq _) =
  DM.unionsWith (+)
    [ DM.fromList [ (cat, fromIntegral occ * p) | (cat, p) <- toPrior expo chunk table ]
    | (chunk, occ) <- DM.toList chunkFreq
    , chunk /= " "
    ]
136
-- | Value of one chunk for every category of the table: the chunk's
-- @(frequency, total)@ pair is looked up in each category's 'EventBook' and
-- the pairs are handed to 'prior'.
--
-- TODO: monoids (but proba >= 0)
toPrior :: Int -> String -> Events a -> [(a, Double)]
toPrior n'' s el = prior n'' $ pebLang s el
  where
    -- One (frequency, total) pair per category.
    pebLang :: String -> Events a -> [(a, (Freq, TotalFreq))]
    pebLang st = map (\(l, eb) -> (l, peb st eb)) . DM.toList

    -- Frequency of the chunk (0 when unseen) and total of chunks of the same
    -- size (1 when unknown, which keeps the later division defined).
    peb :: String -> EventBook -> (Freq, TotalFreq)
    peb st (EventBook mapFreq mapN) = (a, b)
      where
        a = DM.findWithDefault 0 st mapFreq
        -- 'wordToBook' keys 'events_n' by n, while 'chunks' n cuts with
        -- @chunkAlong (n+1) 1@.  Assuming that yields chunks of n+1
        -- characters (TODO confirm in Gargantext.Prelude), the total for a
        -- chunk of length k lives under key k-1.  Looking up @length st@
        -- itself read the total of the next size up, and fell back to 1 for
        -- the largest size.
        b = DM.findWithDefault 1 (length st - 1) mapN
146 where
147 a = maybe 0 identity $ DM.lookup st mapFreq
148 b = maybe 1 identity $ DM.lookup (length st) mapN
149
150
-- | Turn per-category @(frequency, total)@ pairs into one value per category.
--
-- For a pair @(a, b)@ the value is @(a / sum of all a) ^ i * (a / b)@.
-- Both ratios go through 'part', so a zero numerator or denominator gives 0.
-- The second ratio used to be a raw division: a category whose total is 0
-- produced NaN (0/0) or Infinity, which then poisoned the ranking.
prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
prior i ps = zip ls (map score ps')
  where
    (ls, ps'') = DL.unzip ps

    ps' :: [(Double, Double)]
    ps' = map (both fromIntegral) ps''

    -- Sum of the frequencies over all categories, computed once.
    total :: Double
    total = sum (map fst ps')

    score :: (Double, Double) -> Double
    score (a, b) = part a total ^ i * part a b
157
-- | Division that yields 0 as soon as the numerator or the denominator is 0.
part :: (Eq p, Fractional p) => p -> p -> p
part x y = case (x, y) of
  (0, _) -> 0
  (_, 0) -> 0
  _      -> x / y
162
163 {-
164 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
165 t (a, b) -> t (a, b)
166 toProba xs = map (\(a,b) -> (a, part b total)) xs
167 where
168 total = sum $ map snd xs
169 -}
-- | Build the category table from labelled texts.
--
-- Every labelled text becomes a one-category table holding its 'EventBook';
-- these are merged with 'opEvent' (counts added), starting from a table that
-- holds the default category @e@ with an empty text.
--
-- TODO: monoids
toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
toEvents e ns n cws = foldl' (opEvent (+)) seed [ single cw | cw <- cws ]
  where
    -- Table holding only the book of one labelled text.
    single (CatWord l txt) = DM.singleton l (wordsToBook ns n txt)

    -- Guarantees that the default category is present.
    seed = single (CatWord e "")
179
-- | Merge two category tables; the books of a category present on both
-- sides are combined with 'op' @f@.
opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
opEvent f left right = DM.unionWith (op f) left right
182
183 ------------------------------------------------------------------------
184
-- | Starting 'EventBook' for a fold: the book of a single space.
emptyEventBook :: [Int] -> Int -> EventBook
emptyEventBook ns n = wordToBook ns n blank
  where
    blank :: Word
    blank = " "
187
-- | 'EventBook' of a whole text: the text is split by 'words', each word is
-- turned into a book by 'wordToBook', and the books are added onto
-- 'emptyEventBook' with 'op' @(+)@.
wordsToBook :: [Int] -> Int -> String -> EventBook
wordsToBook ns n txt = foldl' addWord (emptyEventBook ns n) (words (pack txt))
  where
    addWord book w = op (+) book (wordToBook ns n (unpack w))
193
-- | 'EventBook' of a single word.
--
-- 'allChunks' gives one list of chunks per element of @ns@.  'events_n'
-- records the length of each list under the corresponding element of @ns@;
-- 'events_freq' is the left-biased union of the occurrence tables of the
-- lists.
wordToBook :: [Int] -> Int -> Word -> EventBook
wordToBook ns n txt = EventBook ef en
  where
    chks = allChunks ns n txt
    en   = DM.fromList [ (k, length cs) | (k, cs) <- zip ns chks ]
    ef   = DM.unions [ occurrencesWith identity cs | cs <- chks ]
200
-- | Combine two 'EventBook's field by field; values stored under the same
-- key are merged with @f@.
op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
op f (EventBook ef1 en1) (EventBook ef2 en2) = EventBook freqs totals
  where
    freqs  = DM.unionWith f ef1 ef2
    totals = DM.unionWith f en1 en2
205
206 ------------------------------------------------------------------------
207 ------------------------------------------------------------------------
-- | 'chunks' of a string for every size in @ns@, in the order of @ns@.
allChunks :: [Int] -> Int -> String -> [[String]]
allChunks ns m st = [ chunks n m st | n <- ns ]
210
-- | Same job as @splitBy@ in Context, but on 'String' rather than Text, so
-- no pack / unpack is needed.
--
-- The string is surrounded with spaces by 'blanks', repeated 1000 times and
-- concatenated, then cut by @chunkAlong (n+1) 1@.  Chunks made only of
-- spaces are dropped and at most @m@ chunks are kept.
chunks :: Int -> Int -> String -> [String]
chunks n m st = DL.take m (filter notBlank windows)
  where
    padded   = DL.concat (DL.replicate 1000 (blanks st))
    windows  = chunkAlong (n+1) 1 padded
    notBlank = not . all (== ' ')
220
-- | String preparation: put one space before and one after a non-empty
-- string; the empty string is returned unchanged.
blanks :: String -> String
blanks xs = case xs of
  [] -> []
  _  -> (' ' : xs) <> [' ']
225
226
227 {-
228 -- Some previous tests to be removed
229 --import GHC.Base (Functor)
230 --import Numeric.Probability.Distribution ((??))
231 --import qualified Numeric.Probability.Distribution as D
232
233 -- | Blocks increase the size of the word to ease computations
234 -- some border and unexpected effects can happen; this needs to be tested
235 blockOf :: Int -> String -> String
236 blockOf n = DL.concat . DL.take n . DL.repeat
237
238 -- * Make the distributions
239 makeDist :: [String] -> D.T Double String
240 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
241
242 stopDist :: D.T Double String
243 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
244
245 candDist :: D.T Double String
246 candDist = makeDist candList
247
248 ------------------------------------------------------------------------
249 sumProba :: Num a => D.T a String -> [Char] -> a
250 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
251
252 -- | Get probability according a distribution
253 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
254 (~?) ds x = (==x) ?? ds
255
256 ------------------------------------------------------------------------
257 candidate :: [Char] -> Candidate
258 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
259
260 ------------------------------------------------------------------------
261 candList :: [String]
262 candList = [ "france", "alexandre", "mael", "constitution"
263 , "etats-unis", "associes", "car", "train", "spam"]
264
265 --}