]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Learn.hs
RENAME FIX
[gargantext.git] / src / Gargantext / Text / Learn.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
18
19 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
20 where
21
22 --import Data.Char (toLower)
23 import qualified Data.List as DL
24
25 import Data.Maybe (maybe)
26 import Data.Map.Strict (Map, toList)
27 import qualified Data.Map.Strict as DM
28
29 import Data.String (String)
30
31 import Data.Text (Text)
32 import Data.Text (pack, unpack, toLower)
33 import Data.Tuple.Extra (both)
34
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..), allLangs)
37 import Gargantext.Text.Terms.Mono (words)
38 import Gargantext.Text.Metrics.Count (occurrencesWith)
39
40 import qualified Gargantext.Text.Samples.FR as FR
41 import qualified Gargantext.Text.Samples.EN as EN
42 --import qualified Gargantext.Text.Samples.DE as DE
43 --import qualified Gargantext.Text.Samples.SP as SP
44 --import qualified Gargantext.Text.Samples.CH as CH
45
46 ------------------------------------------------------------------------
47 data Candidate = Candidate { stop :: Double
48 , noStop :: Double
49 } deriving (Show)
50
51 ------------------------------------------------------------------------
52 -- * Analyze candidate
53 type StringSize = Int
54 type TotalFreq = Int
55 type Freq = Int
56 type Word = String
57
58 data CatWord a = CatWord a Word
59 type CatProb a = Map a Double
60
61 type Events a = Map a EventBook
62
63 ------------------------------------------------------------------------
64 detectLangDefault :: Text -> Maybe Lang
65 detectLangDefault = detectCat 99 eventLang
66 where
67 eventLang :: Events Lang
68 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
69
70 langWord :: Lang -> CatWord Lang
71 langWord l = CatWord l (textSample l)
72
73 textSample :: Lang -> String
74 textSample EN = EN.textSample
75 textSample FR = FR.textSample
76 --textSample DE = DE.textSample
77 --textSample SP = SP.textSample
78 --textSample CH = CH.textSample
79
80 detectStopDefault :: Text -> Maybe Bool
81 detectStopDefault = undefined
82
83 detectDefault :: [(Bool, Text)] -> Text -> Maybe Bool
84 detectDefault events = detectCat 99 (priorEvents events)
85 where
86 priorEvents events' = toEvents True [0..2] 10 (map (\(a,b) -> CatWord a (unpack $ toLower b)) events')
87
88 ------------------------------------------------------------------------
89 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
90 detectCat n es = head . map fst . (detectCat' n es) . unpack
91 where
92 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
93 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
94 $ toList
95 $ detectWith n' es' (wordsToBook [0..2] n' s)
96
97
98 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
99 detectWith n'' el (EventBook mapFreq _) =
100 DM.unionsWith (+)
101 $ map DM.fromList
102 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
103 $ filter (\x -> fst x /= " ")
104 $ DM.toList mapFreq
105
106 -- | TODO: monoids (but proba >= 0)
107 toPrior :: Int -> String -> Events a -> [(a, Double)]
108 toPrior n'' s el = prior n'' $ pebLang s el
109 where
110 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
111 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
112
113 peb :: String -> EventBook -> (Freq, TotalFreq)
114 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
115 where
116 a = maybe 0 identity $ DM.lookup st mapFreq
117 b = maybe 1 identity $ DM.lookup (length st) mapN
118
119
120 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
121 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
122 (map (\(a,b) -> a / b) ps')
123 where
124 (ls, ps'') = DL.unzip ps
125 ps' = map (both fromIntegral) ps''
126
127 part :: (Eq p, Fractional p) => p -> p -> p
128 part 0 _ = 0
129 part _ 0 = 0
130 part x y = x / y
131
132 {-
133 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
134 t (a, b) -> t (a, b)
135 toProba xs = map (\(a,b) -> (a, part b total)) xs
136 where
137 total = sum $ map snd xs
138 -}
139 -- | TODO: monoids
140 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
141 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
142 where
143 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
144 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
145
146 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
147 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
148
149 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
150 opEvent f = DM.unionWith (op f)
151
152 ------------------------------------------------------------------------
153 ------------------------------------------------------------------------
154 data EventBook = EventBook { events_freq :: Map String Freq
155 , events_n :: Map StringSize TotalFreq
156 }
157 deriving (Show)
158
159 emptyEventBook :: [Int] -> Int -> EventBook
160 emptyEventBook ns n = wordToBook ns n " "
161
162 wordsToBook :: [Int] -> Int -> String -> EventBook
163 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
164 where
165 ws = map unpack $ words $ pack txt
166 eventsBook = map (wordToBook ns n) ws
167
168 wordToBook :: [Int] -> Int -> Word -> EventBook
169 wordToBook ns n txt = EventBook ef en
170 where
171 chks = allChunks ns n txt
172 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
173 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
174
175 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
176 op f (EventBook ef1 en1)
177 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
178 (DM.unionWith f en1 en2)
179
180 ------------------------------------------------------------------------
181 ------------------------------------------------------------------------
182 allChunks :: [Int] -> Int -> String -> [[String]]
183 allChunks ns m st = map (\n -> chunks n m st) ns
184
185 -- | Chunks is the same function as splitBy in Context but for Strings,
186 -- not Text (without pack and unpack operations that are not needed).
187 chunks :: Int -> Int -> String -> [String]
188 chunks n m = DL.take m . filter (not . all (== ' '))
189 . chunkAlong (n+1) 1
190 . DL.concat
191 . DL.take 1000
192 . DL.repeat
193 . blanks
194
195 -- | String preparation
196 blanks :: String -> String
197 blanks [] = []
198 blanks xs = [' '] <> xs <> [' ']
199
200
201 {-
202 -- Some previous tests to be removed
203 --import GHC.Base (Functor)
204 --import Numeric.Probability.Distribution ((??))
205 --import qualified Numeric.Probability.Distribution as D
206
207 -- | Blocks increase the size of the word to ease computations
208 -- some border and unexepected effects can happen, need to be tested
209 blockOf :: Int -> String -> String
210 blockOf n = DL.concat . DL.take n . DL.repeat
211
212 -- * Make the distributions
213 makeDist :: [String] -> D.T Double String
214 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
215
216 stopDist :: D.T Double String
217 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
218
219 candDist :: D.T Double String
220 candDist = makeDist candList
221
222 ------------------------------------------------------------------------
223 sumProba :: Num a => D.T a String -> [Char] -> a
224 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
225
226 -- | Get probability according a distribution
227 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
228 (~?) ds x = (==x) ?? ds
229
230 ------------------------------------------------------------------------
231 candidate :: [Char] -> Candidate
232 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
233
234 ------------------------------------------------------------------------
235 candList :: [String]
236 candList = [ "france", "alexandre", "mael", "constitution"
237 , "etats-unis", "associes", "car", "train", "spam"]
238
239 --}