]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Learn.hs
[REFACTO] FLOW DEV
[gargantext.git] / src / Gargantext / Text / Learn.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 TODO:
11 - generalize to byteString
12
13 Stop words and (how to learn it).
14
15 Main type here is String.
16
17 -}
18
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21
22 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
23 where
24
25 --import Data.Char (toLower)
26 import qualified Data.List as DL
27
28 import Data.Maybe (maybe)
29 import Data.Map.Strict (Map, toList)
30 import qualified Data.Map.Strict as DM
31
32 import Data.String (String)
33
34 import Data.Text (Text)
35 import Data.Text (pack, unpack, toLower)
36 import Data.Tuple.Extra (both)
37
38 import Gargantext.Prelude
39 import Gargantext.Core (Lang(..), allLangs)
40 import Gargantext.Text.Terms.Mono (words)
41 import Gargantext.Text.Metrics.Count (occurrencesWith)
42
43 import qualified Gargantext.Text.Samples.FR as FR
44 import qualified Gargantext.Text.Samples.EN as EN
45 --import qualified Gargantext.Text.Samples.DE as DE
46 --import qualified Gargantext.Text.Samples.SP as SP
47 --import qualified Gargantext.Text.Samples.CH as CH
48
49 ------------------------------------------------------------------------
50 data Candidate = Candidate { stop :: Double
51 , noStop :: Double
52 } deriving (Show)
53
54 ------------------------------------------------------------------------
55 -- * Analyze candidate
56 type StringSize = Int
57 type TotalFreq = Int
58 type Freq = Int
59 type Word = String
60
61 data CatWord a = CatWord a Word
62 type CatProb a = Map a Double
63
64 type Events a = Map a EventBook
65
66 ------------------------------------------------------------------------
67 detectStopDefault :: Text -> Maybe Bool
68 detectStopDefault = undefined
69
70 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
71 detectBool events = detectDefault False events
72
73 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
74 detectDefault = detectDefaultWith identity
75
76 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
77 detectDefaultWith f d events = detectDefaultWithPriors f ps
78 where
79 ps = priorEventsWith f d events
80
81 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
82 detectDefaultWithPriors f priors = detectCat 99 priors . f
83
84 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
85 priorEventsWith f d e = toEvents d [0..2] 10 es
86 where
87 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
88
89
90 ------------------------------------------------------------------------
91 detectLangDefault :: Text -> Maybe Lang
92 detectLangDefault = detectCat 99 eventLang
93 where
94 eventLang :: Events Lang
95 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
96
97 langWord :: Lang -> CatWord Lang
98 langWord l = CatWord l (textSample l)
99
100 textSample :: Lang -> String
101 textSample EN = EN.textSample
102 textSample FR = FR.textSample
103 --textSample DE = DE.textSample
104 --textSample SP = SP.textSample
105 --textSample CH = CH.textSample
106 ------------------------------------------------------------------------
107 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
108 detectCat n es = head . map fst . (detectCat' n es) . unpack
109 where
110 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
111 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
112 $ toList
113 $ detectWith n' es' (wordsToBook [0..2] n' s)
114
115
116 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
117 detectWith n'' el (EventBook mapFreq _) =
118 DM.unionsWith (+)
119 $ map DM.fromList
120 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
121 $ filter (\x -> fst x /= " ")
122 $ DM.toList mapFreq
123
124 -- | TODO: monoids (but proba >= 0)
125 toPrior :: Int -> String -> Events a -> [(a, Double)]
126 toPrior n'' s el = prior n'' $ pebLang s el
127 where
128 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
129 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
130
131 peb :: String -> EventBook -> (Freq, TotalFreq)
132 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
133 where
134 a = maybe 0 identity $ DM.lookup st mapFreq
135 b = maybe 1 identity $ DM.lookup (length st) mapN
136
137
138 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
139 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
140 (map (\(a,b) -> a / b) ps')
141 where
142 (ls, ps'') = DL.unzip ps
143 ps' = map (both fromIntegral) ps''
144
145 part :: (Eq p, Fractional p) => p -> p -> p
146 part 0 _ = 0
147 part _ 0 = 0
148 part x y = x / y
149
150 {-
151 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
152 t (a, b) -> t (a, b)
153 toProba xs = map (\(a,b) -> (a, part b total)) xs
154 where
155 total = sum $ map snd xs
156 -}
157 -- | TODO: monoids
158 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
159 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
160 where
161 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
162 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
163
164 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
165 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
166
167 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
168 opEvent f = DM.unionWith (op f)
169
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
172 data EventBook = EventBook { events_freq :: Map String Freq
173 , events_n :: Map StringSize TotalFreq
174 }
175 deriving (Show)
176
177 emptyEventBook :: [Int] -> Int -> EventBook
178 emptyEventBook ns n = wordToBook ns n " "
179
180 wordsToBook :: [Int] -> Int -> String -> EventBook
181 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
182 where
183 ws = map unpack $ words $ pack txt
184 eventsBook = map (wordToBook ns n) ws
185
186 wordToBook :: [Int] -> Int -> Word -> EventBook
187 wordToBook ns n txt = EventBook ef en
188 where
189 chks = allChunks ns n txt
190 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
191 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
192
193 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
194 op f (EventBook ef1 en1)
195 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
196 (DM.unionWith f en1 en2)
197
198 ------------------------------------------------------------------------
199 ------------------------------------------------------------------------
200 allChunks :: [Int] -> Int -> String -> [[String]]
201 allChunks ns m st = map (\n -> chunks n m st) ns
202
203 -- | Chunks is the same function as splitBy in Context but for Strings,
204 -- not Text (without pack and unpack operations that are not needed).
205 chunks :: Int -> Int -> String -> [String]
206 chunks n m = DL.take m . filter (not . all (== ' '))
207 . chunkAlong (n+1) 1
208 . DL.concat
209 . DL.take 1000
210 . DL.repeat
211 . blanks
212
213 -- | String preparation
214 blanks :: String -> String
215 blanks [] = []
216 blanks xs = [' '] <> xs <> [' ']
217
218
219 {-
220 -- Some previous tests to be removed
221 --import GHC.Base (Functor)
222 --import Numeric.Probability.Distribution ((??))
223 --import qualified Numeric.Probability.Distribution as D
224
225 -- | Blocks increase the size of the word to ease computations
226 -- some border and unexepected effects can happen, need to be tested
227 blockOf :: Int -> String -> String
228 blockOf n = DL.concat . DL.take n . DL.repeat
229
230 -- * Make the distributions
231 makeDist :: [String] -> D.T Double String
232 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
233
234 stopDist :: D.T Double String
235 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
236
237 candDist :: D.T Double String
238 candDist = makeDist candList
239
240 ------------------------------------------------------------------------
241 sumProba :: Num a => D.T a String -> [Char] -> a
242 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
243
244 -- | Get probability according a distribution
245 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
246 (~?) ds x = (==x) ?? ds
247
248 ------------------------------------------------------------------------
249 candidate :: [Char] -> Candidate
250 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
251
252 ------------------------------------------------------------------------
253 candList :: [String]
254 candList = [ "france", "alexandre", "mael", "constitution"
255 , "etats-unis", "associes", "car", "train", "spam"]
256
257 --}