{-|
Module      : Gargantext.Text.Learn
Description : Learning to detect text categories (e.g. language, stop words)
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

TODO:
- generalize to ByteString

Stop words and how to learn them.

The main type here is String.

-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}

module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
  where

import Codec.Serialise
import qualified Data.List as DL

import Data.Maybe (maybe)
import Data.Map.Strict (Map, toList)
import qualified Data.Map.Strict as DM

import GHC.Generics
import Data.String (String)

import Data.Text (Text, pack, unpack, toLower)
import Data.Tuple.Extra (both)
import qualified Data.ByteString.Lazy as BSL

import Gargantext.Prelude
import Gargantext.Prelude.Utils
import Gargantext.Core (Lang(..), allLangs)
import Gargantext.Text.Terms.Mono (words)
import Gargantext.Text.Metrics.Count (occurrencesWith)

import qualified Gargantext.Text.Samples.FR as FR
import qualified Gargantext.Text.Samples.EN as EN
--import qualified Gargantext.Text.Samples.DE as DE
--import qualified Gargantext.Text.Samples.SP as SP
--import qualified Gargantext.Text.Samples.CH as CH

------------------------------------------------------------------------
data Candidate = Candidate { stop   :: Double
                           , noStop :: Double
                           } deriving (Show)

------------------------------------------------------------------------
-- * Analyze candidate
type StringSize = Int
type TotalFreq  = Int
type Freq       = Int
type Word       = String

data CatWord a = CatWord a Word
type CatProb a = Map a Double

type Events a = Map a EventBook

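-- 'Events a' is the learned model: one 'EventBook' of character-chunk
-- frequencies per category @a@. 'CatProb a' holds the score given to each
-- category when classifying a new text.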
------------------------------------------------------------------------
detectStopDefault :: Text -> Maybe Bool
detectStopDefault = undefined

detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
detectBool events = detectDefault False events

detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
detectDefault = detectDefaultWith identity

detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
detectDefaultWith f d events = detectDefaultWithPriors f ps
  where
    ps = priorEventsWith f d events

detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
detectDefaultWithPriors f priors = detectCat 99 priors . f

priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
priorEventsWith f d e = toEvents d [0..2] 10 es
  where
    es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e

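-- A minimal usage sketch (the labelled texts are made-up illustrations, not
-- data from this project): 'detectBool' learns priors from the labelled pairs
-- and then guesses the label of a new text.
exampleDetectBool :: Maybe Bool
exampleDetectBool = detectBool [ (True , "the of and to in is it that was he")
                               , (False, "chimie polymère réaction catalyseur")
                               ] "it was the best of times"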

------------------------------------------------------------------------
detectLangDefault :: Text -> Maybe Lang
detectLangDefault = detectCat 99 eventLang
  where
    eventLang :: Events Lang
    eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]

    langWord :: Lang -> CatWord Lang
    langWord l = CatWord l (textSample l)

    textSample :: Lang -> String
    textSample EN = EN.textSample
    textSample FR = FR.textSample
    --textSample DE = DE.textSample
    --textSample SP = SP.textSample
    --textSample CH = CH.textSample
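
-- A minimal usage sketch (the sentence is a made-up illustration; assuming
-- 'allLangs' only lists languages with a sample wired in above, a short
-- French sentence should typically come out as FR, but this is a guess, not a
-- guarantee):
exampleDetectLang :: Maybe Lang
exampleDetectLang = detectLangDefault "Le langage est une faculté humaine."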
------------------------------------------------------------------------
detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
detectCat n es = head . map fst . (detectCat' n es) . unpack
  where
    detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
    detectCat' n' es' s = DL.reverse $ DL.sortOn snd
                        $ toList
                        $ detectWith n' es' (wordsToBook [0..2] n' s)


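-- | For each chunk @s@ occurring @m@ times in the query's 'EventBook' (chunks
-- equal to a single space are skipped), 'detectWith' takes the per-category
-- priors for @s@, scales them by @m@, and sums the results per category.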
detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
detectWith n'' el (EventBook mapFreq _) =
  DM.unionsWith (+)
  $ map DM.fromList
  $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
  $ filter (\x -> fst x /= " ")
  $ DM.toList mapFreq

-- | TODO: monoids (but proba >= 0)
toPrior :: Int -> String -> Events a -> [(a, Double)]
toPrior n'' s el = prior n'' $ pebLang s el
  where
    pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
    pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList

    peb :: String -> EventBook -> (Freq, TotalFreq)
    peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
      where
        a = maybe 0 identity $ DM.lookup st mapFreq
        b = maybe 1 identity $ DM.lookup (length st) mapN


prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
                                                (map (\(a,b) -> a / b) ps')
  where
    (ls, ps'') = DL.unzip ps
    ps' = map (both fromIntegral) ps''

part :: (Eq p, Fractional p) => p -> p -> p
part 0 _ = 0
part _ 0 = 0
part x y = x / y
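
-- Quick checks for 'part' (safe division: either operand being zero yields 0):
--
-- >>> part 3 4 :: Double
-- 0.75
-- >>> part 3 0 :: Double
-- 0.0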

{-
toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
           t (a, b) -> t (a, b)
toProba xs = map (\(a,b) -> (a, part b total)) xs
  where
    total = sum $ map snd xs
-}
-- | TODO: monoids
toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
  where
    emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
    emptyEvent e' ns' n' = toEvent ns' n' (CatWord e' "")

    toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
    toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]

    opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
    opEvent f = DM.unionWith (op f)
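
-- A minimal sketch of building a model by hand (labels and texts are made-up
-- illustrations): each 'CatWord' becomes an 'EventBook' keyed by its label,
-- and books that share a label are merged with @(+)@.
exampleEvents :: Events Bool
exampleEvents = toEvents False [0..2] 10
                  [ CatWord True  "the of and to in is"
                  , CatWord False "polymère réaction catalyseur"
                  ]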

------------------------------------------------------------------------
------------------------------------------------------------------------
data EventBook = EventBook { events_freq :: Map String Freq
                           , events_n    :: Map StringSize TotalFreq
                           }
  deriving (Show, Generic)

instance Serialise EventBook

instance (Serialise a, Ord a) => SaveFile (Events a) where
  saveFile' f d = BSL.writeFile f (serialise d)

instance (Serialise a, Ord a) => ReadFile (Events a) where
  readFile' filepath = deserialise <$> BSL.readFile filepath

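-- A minimal sketch of exporting a learned model and loading it back through
-- the instances above (the caller chooses the file path; the label type only
-- needs 'Serialise' and 'Ord'). These wrappers are an illustration, not part
-- of the original API.
saveEventsSketch :: (Serialise a, Ord a) => FilePath -> Events a -> IO ()
saveEventsSketch = saveFile'

readEventsSketch :: (Serialise a, Ord a) => FilePath -> IO (Events a)
readEventsSketch = readFile'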

emptyEventBook :: [Int] -> Int -> EventBook
emptyEventBook ns n = wordToBook ns n " "

wordsToBook :: [Int] -> Int -> String -> EventBook
wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
  where
    ws = map unpack $ words $ pack txt
    eventsBook = map (wordToBook ns n) ws

wordToBook :: [Int] -> Int -> Word -> EventBook
wordToBook ns n txt = EventBook ef en
  where
    chks = allChunks ns n txt
    en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
    ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
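
-- A reading aid for 'wordToBook' (describing the code above, not changing it):
-- @ef@ counts each character chunk produced by 'allChunks' for the word, and
-- @en@ records, for every requested size parameter in @ns@, how many chunks
-- that parameter produced.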

op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
op f (EventBook ef1 en1)
     (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
                                     (DM.unionWith f en1 en2)

------------------------------------------------------------------------
------------------------------------------------------------------------
allChunks :: [Int] -> Int -> String -> [[String]]
allChunks ns m st = map (\n -> chunks n m st) ns

-- | 'chunks' is the same function as splitBy in Context, but for Strings
-- rather than Text (avoiding pack and unpack operations that are not needed
-- here).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
           . chunkAlong (n+1) 1
           . DL.concat
           . DL.take 1000
           . DL.repeat
           . blanks
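
-- A reading aid for 'chunks' (describing the pipeline above, not changing it):
-- the word is blank-padded, repeated, and cut with @chunkAlong (n+1) 1@;
-- all-space pieces are dropped and at most @m@ chunks are kept.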

-- | String preparation
blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']


{-
-- Some previous tests to be removed
--import GHC.Base (Functor)
--import Numeric.Probability.Distribution ((??))
--import qualified Numeric.Probability.Distribution as D

-- | Blocks increase the size of the word to ease computations;
-- some border and unexpected effects can happen and need to be tested.
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat

-- * Make the distributions
makeDist :: [String] -> D.T Double String
makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)

stopDist :: D.T Double String
stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList

candDist :: D.T Double String
candDist = makeDist candList

------------------------------------------------------------------------
sumProba :: Num a => D.T a String -> [Char] -> a
sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x

-- | Get the probability according to a distribution
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
(~?) ds x = (==x) ?? ds

------------------------------------------------------------------------
candidate :: [Char] -> Candidate
candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)

------------------------------------------------------------------------
candList :: [String]
candList = [ "france", "alexandre", "mael", "constitution"
           , "etats-unis", "associes", "car", "train", "spam"]

--}