]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Learn.hs
Merge branch 'dev' into dev-list-charts
[gargantext.git] / src / Gargantext / Text / Learn.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 TODO:
11 - generalize to byteString
12 - Stop words and (how to learn it).
13 - Main type here is String check if Chars on Text would be optimized
14
15 -}
16
17 {-# LANGUAGE TypeSynonymInstances #-}
18
19 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
20 where
21
22 import Codec.Serialise
23 import qualified Data.List as DL
24
25 import Data.Maybe (maybe)
26 import Data.Map.Strict (Map, toList)
27 import qualified Data.Map.Strict as DM
28
29 import GHC.Generics
30 import Data.String (String)
31
32 import Data.Text (Text)
33 import Data.Text (pack, unpack, toLower)
34 import Data.Tuple.Extra (both)
35 import qualified Data.ByteString.Lazy as BSL
36
37 import Gargantext.Prelude
38 import Gargantext.Prelude.Utils
39 import Gargantext.Core (Lang(..), allLangs)
40 import Gargantext.Text.Terms.Mono (words)
41 import Gargantext.Text.Metrics.Count (occurrencesWith)
42
43 import qualified Gargantext.Text.Samples.FR as FR
44 import qualified Gargantext.Text.Samples.EN as EN
45 --import qualified Gargantext.Text.Samples.DE as DE
46 --import qualified Gargantext.Text.Samples.SP as SP
47 --import qualified Gargantext.Text.Samples.CH as CH
48
49 ------------------------------------------------------------------------
50 data Candidate = Candidate { stop :: Double
51 , noStop :: Double
52 } deriving (Show)
53
54 ------------------------------------------------------------------------
55 -- * Analyze candidate
56 type StringSize = Int
57 type TotalFreq = Int
58 type Freq = Int
59 type Word = String
60
61 data CatWord a = CatWord a Word
62 type CatProb a = Map a Double
63
64 type Events a = Map a EventBook
65 ------------------------------------------------------------------------
66 data EventBook = EventBook { events_freq :: Map String Freq
67 , events_n :: Map StringSize TotalFreq
68 }
69 deriving (Show, Generic)
70
71 instance Serialise EventBook
72
73 instance (Serialise a, Ord a) => SaveFile (Events a) where
74 saveFile' f d = BSL.writeFile f (serialise d)
75
76 instance (Serialise a, Ord a) => ReadFile (Events a) where
77 readFile' filepath = deserialise <$> BSL.readFile filepath
78
79 ------------------------------------------------------------------------
80 detectStopDefault :: Text -> Maybe Bool
81 detectStopDefault = undefined
82
83 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
84 detectBool events = detectDefault False events
85
86 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
87 detectDefault = detectDefaultWith identity
88
89 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
90 detectDefaultWith f d events = detectDefaultWithPriors f ps
91 where
92 ps = priorEventsWith f d events
93
94 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
95 detectDefaultWithPriors f priors = detectCat 99 priors . f
96
97 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
98 priorEventsWith f d e = toEvents d [0..2] 10 es
99 where
100 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
101
102
103 ------------------------------------------------------------------------
104 detectLangDefault :: Text -> Maybe Lang
105 detectLangDefault = detectCat 99 eventLang
106 where
107 eventLang :: Events Lang
108 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
109
110 langWord :: Lang -> CatWord Lang
111 langWord l = CatWord l (textSample l)
112
113 textSample :: Lang -> String
114 textSample EN = EN.textSample
115 textSample FR = FR.textSample
116 textSample _ = panic "textSample: not impl yet"
117 --textSample DE = DE.textSample
118 --textSample SP = SP.textSample
119 --textSample CH = CH.textSample
120 ------------------------------------------------------------------------
121 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
122 detectCat n es = head . map fst . (detectCat' n es) . unpack
123 where
124 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
125 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
126 $ toList
127 $ detectWith n' es' (wordsToBook [0..2] n' s)
128
129
130 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
131 detectWith n'' el (EventBook mapFreq _) =
132 DM.unionsWith (+)
133 $ map DM.fromList
134 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
135 $ filter (\x -> fst x /= " ")
136 $ DM.toList mapFreq
137
138 -- | TODO: monoids (but proba >= 0)
139 toPrior :: Int -> String -> Events a -> [(a, Double)]
140 toPrior n'' s el = prior n'' $ pebLang s el
141 where
142 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
143 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
144
145 peb :: String -> EventBook -> (Freq, TotalFreq)
146 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
147 where
148 a = maybe 0 identity $ DM.lookup st mapFreq
149 b = maybe 1 identity $ DM.lookup (length st) mapN
150
151
152 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
153 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
154 (map (\(a,b) -> a / b) ps')
155 where
156 (ls, ps'') = DL.unzip ps
157 ps' = map (both fromIntegral) ps''
158
159 part :: (Eq p, Fractional p) => p -> p -> p
160 part 0 _ = 0
161 part _ 0 = 0
162 part x y = x / y
163
164 {-
165 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
166 t (a, b) -> t (a, b)
167 toProba xs = map (\(a,b) -> (a, part b total)) xs
168 where
169 total = sum $ map snd xs
170 -}
171 -- | TODO: monoids
172 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
173 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
174 where
175 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
176 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
177
178 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
179 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
180
181 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
182 opEvent f = DM.unionWith (op f)
183
184 ------------------------------------------------------------------------
185
186 emptyEventBook :: [Int] -> Int -> EventBook
187 emptyEventBook ns n = wordToBook ns n " "
188
189 wordsToBook :: [Int] -> Int -> String -> EventBook
190 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
191 where
192 ws = map unpack $ words $ pack txt
193 eventsBook = map (wordToBook ns n) ws
194
195 wordToBook :: [Int] -> Int -> Word -> EventBook
196 wordToBook ns n txt = EventBook ef en
197 where
198 chks = allChunks ns n txt
199 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
200 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
201
202 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
203 op f (EventBook ef1 en1)
204 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
205 (DM.unionWith f en1 en2)
206
207 ------------------------------------------------------------------------
208 ------------------------------------------------------------------------
209 allChunks :: [Int] -> Int -> String -> [[String]]
210 allChunks ns m st = map (\n -> chunks n m st) ns
211
212 -- | Chunks is the same function as splitBy in Context but for Strings,
213 -- not Text (without pack and unpack operations that are not needed).
214 chunks :: Int -> Int -> String -> [String]
215 chunks n m = DL.take m . filter (not . all (== ' '))
216 . chunkAlong (n+1) 1
217 . DL.concat
218 . DL.take 1000
219 . DL.repeat
220 . blanks
221
222 -- | String preparation
223 blanks :: String -> String
224 blanks [] = []
225 blanks xs = [' '] <> xs <> [' ']
226
227
228 {-
229 -- Some previous tests to be removed
230 --import GHC.Base (Functor)
231 --import Numeric.Probability.Distribution ((??))
232 --import qualified Numeric.Probability.Distribution as D
233
234 -- | Blocks increase the size of the word to ease computations
235 -- some border and unexepected effects can happen, need to be tested
236 blockOf :: Int -> String -> String
237 blockOf n = DL.concat . DL.take n . DL.repeat
238
239 -- * Make the distributions
240 makeDist :: [String] -> D.T Double String
241 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
242
243 stopDist :: D.T Double String
244 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
245
246 candDist :: D.T Double String
247 candDist = makeDist candList
248
249 ------------------------------------------------------------------------
250 sumProba :: Num a => D.T a String -> [Char] -> a
251 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
252
253 -- | Get probability according a distribution
254 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
255 (~?) ds x = (==x) ?? ds
256
257 ------------------------------------------------------------------------
258 candidate :: [Char] -> Candidate
259 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
260
261 ------------------------------------------------------------------------
262 candList :: [String]
263 candList = [ "france", "alexandre", "mael", "constitution"
264 , "etats-unis", "associes", "car", "train", "spam"]
265
266 --}