src/Gargantext/Text/Learn.hs
{-|
Module      : Gargantext.Text.Learn
Description : Learn to detect languages and stop words
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

TODO:
- generalize to ByteString
- stop words (and how to learn them)
- the main type here is String; check whether working on Text (Chars) would be more efficient

-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}

module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
  where

import Codec.Serialise
import qualified Data.List as DL

import Data.Maybe (maybe)
import Data.Map.Strict (Map, toList)
import qualified Data.Map.Strict as DM

import GHC.Generics
import Data.String (String)

import Data.Text (Text, pack, unpack, toLower)
import Data.Tuple.Extra (both)
import qualified Data.ByteString.Lazy as BSL

import Gargantext.Prelude
import Gargantext.Prelude.Utils
import Gargantext.Core (Lang(..), allLangs)
import Gargantext.Text.Terms.Mono (words)
import Gargantext.Text.Metrics.Count (occurrencesWith)

import qualified Gargantext.Text.Samples.FR as FR
import qualified Gargantext.Text.Samples.EN as EN
--import qualified Gargantext.Text.Samples.DE as DE
--import qualified Gargantext.Text.Samples.SP as SP
--import qualified Gargantext.Text.Samples.CH as CH

------------------------------------------------------------------------
data Candidate = Candidate { stop   :: Double
                           , noStop :: Double
                           } deriving (Show)

------------------------------------------------------------------------
-- * Analyze candidate
type StringSize = Int
type TotalFreq  = Int
type Freq       = Int
type Word       = String

data CatWord a = CatWord a Word
type CatProb a = Map a Double

type Events a = Map a EventBook
------------------------------------------------------------------------
-- | An EventBook stores the character-chunk statistics of a text:
-- 'events_freq' maps each chunk to its frequency, and 'events_n' maps
-- each chunk-size parameter to the total number of chunks observed for
-- that size.
data EventBook = EventBook { events_freq :: Map String     Freq
                           , events_n    :: Map StringSize TotalFreq
                           }
                 deriving (Show, Generic)

instance Serialise EventBook

instance (Serialise a, Ord a) => SaveFile (Events a) where
  saveFile' f d = BSL.writeFile f (serialise d)

instance (Serialise a, Ord a) => ReadFile (Events a) where
  readFile' filepath = deserialise <$> BSL.readFile filepath

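-- A minimal sketch of persisting and reloading a trained model through the
-- instances above (the helper names are hypothetical; saveFile' and readFile'
-- are assumed to take the FilePath first, as in the instance definitions):
--
-- > saveLangModel :: FilePath -> Events Lang -> IO ()
-- > saveLangModel = saveFile'
-- >
-- > loadLangModel :: FilePath -> IO (Events Lang)
-- > loadLangModel = readFile'
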
------------------------------------------------------------------------
detectStopDefault :: Text -> Maybe Bool
detectStopDefault = undefined

detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
detectBool events = detectDefault False events

detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
detectDefault = detectDefaultWith identity

detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
detectDefaultWith f d events = detectDefaultWithPriors f ps
  where
    ps = priorEventsWith f d events

detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
detectDefaultWithPriors f priors = detectCat 99 priors . f

priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
priorEventsWith f d e = toEvents d [0..2] 10 es
  where
    es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e

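-- A minimal usage sketch (the training pairs and the query below are made-up
-- illustrations, not part of the library):
--
-- > isStopWord :: Text -> Maybe Bool
-- > isStopWord = detectBool [ (True , "le la les de des du et est un une")
-- >                         , (False, "molecule reaction energy model graph") ]
--
-- 'isStopWord "les"' would then be expected to lean towards 'Just True'.
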
------------------------------------------------------------------------
detectLangDefault :: Text -> Maybe Lang
detectLangDefault = detectCat 99 eventLang
  where
    eventLang :: Events Lang
    eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]

    langWord :: Lang -> CatWord Lang
    langWord l = CatWord l (textSample l)

    textSample :: Lang -> String
    textSample EN = EN.textSample
    textSample FR = FR.textSample
    --textSample DE = DE.textSample
    --textSample SP = SP.textSample
    --textSample CH = CH.textSample
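
-- Illustrative example (the sentence is made up; the result depends on the
-- language samples compiled in, currently EN and FR only):
--
-- > detectLangDefault "Le chat mange la souris dans le jardin"
--
-- is expected to return 'Just FR'.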
------------------------------------------------------------------------
-- | Detect the most likely category of a text, given prior 'Events'.
detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
detectCat n es = head . map fst . (detectCat' n es) . unpack
  where
    detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
    detectCat' n' es' s = DL.reverse $ DL.sortOn snd
                                     $ toList
                                     $ detectWith n' es' (wordsToBook [0..2] n' s)


detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
detectWith n'' el (EventBook mapFreq _) =
  DM.unionsWith (+)
  $ map DM.fromList
  $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
  $ filter (\x -> fst x /= " ")
  $ DM.toList mapFreq

-- | TODO: monoids (but proba >= 0)
toPrior :: Int -> String -> Events a -> [(a, Double)]
toPrior n'' s el = prior n'' $ pebLang s el
  where
    pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
    pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList

    peb :: String -> EventBook -> (Freq, TotalFreq)
    peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
      where
        a = maybe 0 identity $ DM.lookup st mapFreq
        b = maybe 1 identity $ DM.lookup (length st) mapN


-- | Score each category from its (Freq, TotalFreq) pair: the chunk's share
-- of the frequency over all categories (raised to the power i), multiplied
-- by its relative frequency within the category.
prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
prior i ps = zip ls
           $ zipWith (\x y -> x^i * y)
                     (map (\(a,_) -> part a (sum $ map fst ps')) ps')
                     (map (\(a,b) -> a / b) ps')
  where
    (ls, ps'') = DL.unzip ps
    ps'        = map (both fromIntegral) ps''

part :: (Eq p, Fractional p) => p -> p -> p
part 0 _ = 0
part _ 0 = 0
part x y = x / y

{-
toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
           t (a, b) -> t (a, b)
toProba xs = map (\(a,b) -> (a, part b total)) xs
  where
    total = sum $ map snd xs
-}
-- | TODO: monoids
toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
  where
    emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
    emptyEvent e' ns' n' = toEvent ns' n' (CatWord e' "")

    toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
    toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]

    opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
    opEvent f = DM.unionWith (op f)

------------------------------------------------------------------------

emptyEventBook :: [Int] -> Int -> EventBook
emptyEventBook ns n = wordToBook ns n " "

wordsToBook :: [Int] -> Int -> String -> EventBook
wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
  where
    ws = map unpack $ words $ pack txt
    eventsBook = map (wordToBook ns n) ws

wordToBook :: [Int] -> Int -> Word -> EventBook
wordToBook ns n txt = EventBook ef en
  where
    chks = allChunks ns n txt
    en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
    ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
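
-- For instance (an illustrative call, not a recorded result):
--
-- > wordsToBook [0..2] 10 "hello world"
--
-- splits the text into words and merges one EventBook per word, counting
-- character chunks for each size parameter in the list (at most 10 chunks
-- per size and per word).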

op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
op f (EventBook ef1 en1)
     (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
                                     (DM.unionWith f en1 en2)

------------------------------------------------------------------------
------------------------------------------------------------------------
allChunks :: [Int] -> Int -> String -> [[String]]
allChunks ns m st = map (\n -> chunks n m st) ns

-- | 'chunks' is the same function as splitBy in Context, but for String
-- instead of Text (avoiding pack and unpack operations that are not needed
-- here).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
           . chunkAlong (n+1) 1
           . DL.concat
           . DL.take 1000
           . DL.repeat
           . blanks
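
-- An illustrative call (assuming chunkAlong produces sliding windows of
-- length n+1 with step 1): 'chunks 1 5 "ab"' yields at most 5 two-character
-- windows over the blank-padded, repeated input, such as " a", "ab" and "b ".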

-- | String preparation: surround the String with blanks
blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']


{-
-- Some previous tests to be removed
--import GHC.Base (Functor)
--import Numeric.Probability.Distribution ((??))
--import qualified Numeric.Probability.Distribution as D

-- | Blocks increase the size of the word to ease computations;
-- some border and unexpected effects can happen and need to be tested.
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat

-- * Make the distributions
makeDist :: [String] -> D.T Double String
makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)

stopDist :: D.T Double String
stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList

candDist :: D.T Double String
candDist = makeDist candList

------------------------------------------------------------------------
sumProba :: Num a => D.T a String -> [Char] -> a
sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x

-- | Get the probability according to a distribution
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
(~?) ds x = (==x) ?? ds

------------------------------------------------------------------------
candidate :: [Char] -> Candidate
candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)

------------------------------------------------------------------------
candList :: [String]
candList = [ "france", "alexandre", "mael", "constitution"
           , "etats-unis", "associes", "car", "train", "spam"]

--}