{-|
Module      : Gargantext.Text.Learn
Description : Learn to detect languages and stop words
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

TODO:
- generalize to ByteString
- stop words (and how to learn them)
- the main type here is String; check whether working on Text would be more efficient

-}
16
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE DeriveGeneric #-}
20 {-# LANGUAGE TypeSynonymInstances #-}
21 {-# LANGUAGE FlexibleInstances #-}
22
23 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
24 where
25
26 import Codec.Serialise
27 import qualified Data.List as DL
28
29 import Data.Maybe (maybe)
30 import Data.Map.Strict (Map, toList)
31 import qualified Data.Map.Strict as DM
32
33 import GHC.Generics
34 import Data.String (String)
35
36 import Data.Text (Text)
37 import Data.Text (pack, unpack, toLower)
38 import Data.Tuple.Extra (both)
39 import qualified Data.ByteString.Lazy as BSL
40
41 import Gargantext.Prelude
42 import Gargantext.Prelude.Utils
43 import Gargantext.Core (Lang(..), allLangs)
44 import Gargantext.Text.Terms.Mono (words)
45 import Gargantext.Text.Metrics.Count (occurrencesWith)
46
47 import qualified Gargantext.Text.Samples.FR as FR
48 import qualified Gargantext.Text.Samples.EN as EN
49 --import qualified Gargantext.Text.Samples.DE as DE
50 --import qualified Gargantext.Text.Samples.SP as SP
51 --import qualified Gargantext.Text.Samples.CH as CH
52
53 ------------------------------------------------------------------------
54 data Candidate = Candidate { stop :: Double
55 , noStop :: Double
56 } deriving (Show)
57
58 ------------------------------------------------------------------------
59 -- * Analyze candidate
60 type StringSize = Int
61 type TotalFreq = Int
62 type Freq = Int
63 type Word = String
64
65 data CatWord a = CatWord a Word
66 type CatProb a = Map a Double
67
68 type Events a = Map a EventBook
69 ------------------------------------------------------------------------
70 data EventBook = EventBook { events_freq :: Map String Freq
71 , events_n :: Map StringSize TotalFreq
72 }
73 deriving (Show, Generic)
74
75 instance Serialise EventBook
76
77 instance (Serialise a, Ord a) => SaveFile (Events a) where
78 saveFile' f d = BSL.writeFile f (serialise d)
79
80 instance (Serialise a, Ord a) => ReadFile (Events a) where
81 readFile' filepath = deserialise <$> BSL.readFile filepath
82
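-- A minimal persistence sketch (not part of this module's API): given the two
-- instances above, a trained 'Events' model can be written to disk and read
-- back. 'saveModel' and 'loadModel' are hypothetical wrappers, only
-- 'saveFile'' and 'readFile'' come from the instances above:
--
-- > saveModel :: (Serialise a, Ord a) => FilePath -> Events a -> IO ()
-- > saveModel = saveFile'   -- hypothetical wrapper
-- >
-- > loadModel :: (Serialise a, Ord a) => FilePath -> IO (Events a)
-- > loadModel = readFile'   -- hypothetical wrapper
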
------------------------------------------------------------------------
detectStopDefault :: Text -> Maybe Bool
detectStopDefault = undefined

detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
detectBool events = detectDefault False events

detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
detectDefault = detectDefaultWith identity

detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
detectDefaultWith f d events = detectDefaultWithPriors f ps
  where
    ps = priorEventsWith f d events

detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
detectDefaultWithPriors f priors = detectCat 99 priors . f

priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
priorEventsWith f d e = toEvents d [0..2] 10 es
  where
    es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e

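-- Usage sketch (illustrative only): 'detectBool' can learn a binary detector
-- from hand-labelled sample texts. 'isFrench' and its two training pairs are
-- made up for this example and far too small to be useful in practice:
--
-- > isFrench :: Text -> Maybe Bool
-- > isFrench = detectBool [ (True , "le la les et est une des du")     -- hypothetical sample
-- >                       , (False, "the a an and is are of to") ]     -- hypothetical sample
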
------------------------------------------------------------------------
detectLangDefault :: Text -> Maybe Lang
detectLangDefault = detectCat 99 eventLang
  where
    eventLang :: Events Lang
    eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]

    langWord :: Lang -> CatWord Lang
    langWord l = CatWord l (textSample l)

    textSample :: Lang -> String
    textSample EN = EN.textSample
    textSample FR = FR.textSample
    textSample _  = panic "textSample: not impl yet"
    --textSample DE = DE.textSample
    --textSample SP = SP.textSample
    --textSample CH = CH.textSample
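
-- Expected behaviour (sketch, not a doctest): a snippet close to one of the
-- sample corpora should be detected as that language, e.g.
--
-- > detectLangDefault "la langue française est parlée en France"    -- likely Just FR
-- > detectLangDefault "the quick brown fox jumps over the lazy dog" -- likely Just EN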
------------------------------------------------------------------------
detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
detectCat n es = head . map fst . (detectCat' n es) . unpack
  where
    detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
    detectCat' n' es' s = DL.reverse $ DL.sortOn snd
                        $ toList
                        $ detectWith n' es' (wordsToBook [0..2] n' s)


detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
detectWith n'' el (EventBook mapFreq _) =
  DM.unionsWith (+)
  $ map DM.fromList
  $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
  $ filter (\x -> fst x /= " ")
  $ DM.toList mapFreq

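-- Informal reading of 'detectWith' (descriptive note, not from the original
-- sources): every non-space chunk s of the input, occurring m times, gets one
-- weight per category from 'toPrior'; those weights are scaled by m and summed
-- per category with 'DM.unionsWith (+)', and 'detectCat' then keeps the
-- category with the largest accumulated score.
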
-- | TODO: monoids (but proba >= 0)
toPrior :: Int -> String -> Events a -> [(a, Double)]
toPrior n'' s el = prior n'' $ pebLang s el
  where
    pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
    pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList

    peb :: String -> EventBook -> (Freq, TotalFreq)
    peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
      where
        a = maybe 0 identity $ DM.lookup st mapFreq
        b = maybe 1 identity $ DM.lookup (length st) mapN


prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
prior i ps = zip ls $ zipWith (\x y -> x^i * y)
                              (map (\(a,_) -> part a (sum $ map fst ps')) ps')
                              (map (\(a,b) -> a / b) ps')
  where
    (ls, ps'') = DL.unzip ps
    ps' = map (both fromIntegral) ps''

part :: (Eq p, Fractional p) => p -> p -> p
part 0 _ = 0
part _ 0 = 0
part x y = x / y

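-- Worked example for 'prior' (illustrative numbers only): with i = 2 and
-- ps = [(EN, (3, 10)), (FR, (1, 10))], the EN score is (3/4)^2 * (3/10)
-- = 0.16875 and the FR score is (1/4)^2 * (1/10) = 0.00625: the share of the
-- chunk frequency across categories is raised to the power i, then weighted
-- by the within-category relative frequency.
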
{-
toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
           t (a, b) -> t (a, b)
toProba xs = map (\(a,b) -> (a, part b total)) xs
  where
    total = sum $ map snd xs
-}
-- | TODO: monoids
toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
  where
    emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
    emptyEvent e' ns' n' = toEvent ns' n' (CatWord e' "")

    toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
    toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]

    opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
    opEvent f = DM.unionWith (op f)

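-- Building a model by hand (sketch): 'tinyModel' is hypothetical and its two
-- corpora are far too small to be meaningful, it only shows the shape of the
-- call to 'toEvents':
--
-- > tinyModel :: Events Lang
-- > tinyModel = toEvents FR [0..2] 10 [ CatWord FR "bonjour tout le monde"   -- hypothetical sample
-- >                                   , CatWord EN "hello world" ]           -- hypothetical sample
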
------------------------------------------------------------------------

emptyEventBook :: [Int] -> Int -> EventBook
emptyEventBook ns n = wordToBook ns n " "

wordsToBook :: [Int] -> Int -> String -> EventBook
wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
  where
    ws = map unpack $ words $ pack txt
    eventsBook = map (wordToBook ns n) ws

wordToBook :: [Int] -> Int -> Word -> EventBook
wordToBook ns n txt = EventBook ef en
  where
    chks = allChunks ns n txt
    en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
    ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks

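-- For instance (sketch): 'wordToBook [0..2] 10 "abc"' builds, for each chunk
-- size in [0..2], at most 10 character chunks of the padded word; 'events_n'
-- counts how many chunks were kept per size and 'events_freq' how often each
-- chunk occurs. The exact chunks depend on 'chunkAlong' from the Prelude.
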
op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
op f (EventBook ef1 en1)
     (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
                                     (DM.unionWith f en1 en2)

------------------------------------------------------------------------
------------------------------------------------------------------------
allChunks :: [Int] -> Int -> String -> [[String]]
allChunks ns m st = map (\n -> chunks n m st) ns

-- | 'chunks' is the same function as 'splitBy' in Context but for Strings,
-- not Text (avoiding the pack and unpack operations, which are not needed here).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
           . chunkAlong (n+1) 1
           . DL.concat
           . DL.take 1000
           . DL.repeat
           . blanks

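-- Intuition for 'chunks' (sketch): the word is padded with blanks, repeated to
-- smooth border effects, cut into sliding windows of (n+1) characters, and at
-- most m non-blank windows are kept; e.g. 'chunks 1 5 "ab"' starts with
-- windows such as " a", "ab" and "b " (the details depend on 'chunkAlong').
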
-- | String preparation
blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']


{-
-- Some previous tests to be removed
--import GHC.Base (Functor)
--import Numeric.Probability.Distribution ((??))
--import qualified Numeric.Probability.Distribution as D

-- | Blocks increase the size of the word to ease computations;
-- some border and unexpected effects can happen, this needs to be tested.
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat

-- * Make the distributions
makeDist :: [String] -> D.T Double String
makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)

stopDist :: D.T Double String
stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList

candDist :: D.T Double String
candDist = makeDist candList

------------------------------------------------------------------------
sumProba :: Num a => D.T a String -> [Char] -> a
sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x

-- | Get the probability according to a distribution
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
(~?) ds x = (==x) ?? ds

------------------------------------------------------------------------
candidate :: [Char] -> Candidate
candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)

------------------------------------------------------------------------
candList :: [String]
candList = [ "france", "alexandre", "mael", "constitution"
           , "etats-unis", "associes", "car", "train", "spam"]

--}