{-|
Module      : Gargantext.Text.Metrics
Description : Metrics for text mining and analysis
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Mainly reexports functions from @Data.Text.Metrics@.


TODO
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m

-}

{-# LANGUAGE BangPatterns      #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Text.Metrics
  where

import Data.Text (Text, pack)
import Data.Map (Map)
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import Data.Tuple.Extra (both)
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
import Data.Array.Accelerate (toList)
import Math.KMeans (kmeans, euclidSq, elements)


import Gargantext.Prelude

import Gargantext.Text.Metrics.Count (occurrences, cooc)
import Gargantext.Text.Terms (TermType(MonoMulti), terms)
import Gargantext.Core (Lang(EN))
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index

import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA
-- import Data.Array.Accelerate ((:.)(..), Z(..))

import GHC.Real (round)

import Debug.Trace
import Prelude (seq)

data MapListSize   = MapListSize   Int
data InclusionSize = InclusionSize Int
data SampleBins    = SampleBins    Double
data Clusters      = Clusters      Int
data DefaultValue  = DefaultValue  Int

data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
                                 , fc_inclusionSize :: InclusionSize
                                 , fc_sampleBins    :: SampleBins
                                 , fc_clusters      :: Clusters
                                 , fc_defaultValue  :: DefaultValue
                                 }
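
-- | An illustrative 'FilterConfig' (a sketch only: the numeric values below
-- are assumptions chosen for the example, not project defaults).
filterConfigExample :: FilterConfig
filterConfigExample = FilterConfig (MapListSize   350)
                                   (InclusionSize 500)
                                   (SampleBins     10)
                                   (Clusters        3)
                                   (DefaultValue    0)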

-- | Filter a cooccurrence map, keeping only the pairs whose terms have been
-- selected by 'takeSome' (the map-list terms).
filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
  where
    ts = map _scored_terms $ takeSome fc $ coocScored cc


filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
  foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
         M.empty selection
  where
    selection = [(x,y) | x <- ts
                       , y <- ts
                    -- , x >= y
                       ]

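-- Illustrative behaviour of filterCooc' (the values below are an example,
-- not taken from the code base): with ts = ["a","b"], selection is the full
-- cross product [("a","a"),("a","b"),("b","a"),("b","b")], and every pair
-- absent from the input map is inserted with the default value dv.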

-- | Map list creation
-- K-means splits the terms into (Clusters :: Int) main clusters using
-- Inclusion/Exclusion (relevance score).
-- The main cluster, ordered by Specificity/Genericity, is sampled in
-- (SampleBins :: Double) parts; each part is then ordered by
-- Inclusion/Exclusion, and n scored terms are taken from each part,
-- where n * SampleBins = MapListSize.
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
  $ takeSample n m
  $ L.take l' $ L.reverse $ L.sortOn _scored_incExc scores
  -- $ splitKmeans k scores
  where
    -- TODO: benchmark with accelerate-example kmeans version
    splitKmeans x xs = L.concat $ map elements
                     $ V.take (k-1)
                     $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
                              euclidSq x xs
    n = round ((fromIntegral l)/s)
    m = round $ (fromIntegral $ length scores) / (s)
    takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
      L.concat $ map (L.take n)
               $ map (reverse . (L.sortOn _scored_incExc))
               -- TODO: use kmeans s instead of splitEvery
               -- in order to split in s heterogeneous parts
               -- without the homogeneous-order hypothesis
               $ splitEvery m
               $ L.reverse $ L.sortOn _scored_speGen xs
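
-- Worked example for the sampling arithmetic (illustrative numbers only):
-- with MapListSize 350 and SampleBins 10, each of the 10 bins contributes
-- n = round (350 / 10) = 35 of its top-ranked terms, so n * SampleBins
-- recovers the requested map list size.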


data Scored t = Scored { _scored_terms  :: !t
                       , _scored_incExc :: !InclusionExclusion
                       , _scored_speGen :: !SpecificityGenericity
                       } deriving (Show)

-- | Score the terms of a cooccurrence map with their
-- Inclusion/Exclusion and Specificity/Genericity values.
coocScored :: Ord t => Map (t,t) Int -> [Scored t]
coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
  where
    (ti,fi)  = createIndices m
    (is, ss) = incExcSpeGen $ cooc2mat ti m
    scores   = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
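
-- Usage sketch (the name @myCooc@ is hypothetical and stands for any
-- cooccurrence map, e.g. one produced by 'cooc'; 'filterConfigExample' is
-- the illustrative configuration defined above):
--
-- > scoredTerms  = coocScored myCooc
-- > filteredCooc = filterCooc filterConfigExample myCooc
--
-- 'filterCooc' keeps only the pairs between terms selected by 'takeSome',
-- filling pairs missing from the input map with the configured 'DefaultValue'.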


-- | Inclusion/Exclusion and Specificity/Genericity scores of the terms of a
-- cooccurrence map, each list sorted in decreasing order of score.
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
  where
    (ti,fi) = createIndices m
    ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)



metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences

metrics_sentences' :: [Text]
metrics_sentences' = splitBy (Sentences 0) metrics_text

-- | Sentences
metrics_sentences :: [Text]
metrics_sentences = [ "There is a table with a glass of wine and a spoon."
                    , "I can see the glass on the table."
                    , "There was only a spoon on that table."
                    , "The glass just fell from the table, pouring wine everywhere."
                    , "I wish the glass did not contain wine."
                    ]

metrics_sentences_Test = metrics_sentences == metrics_sentences'

-- | Terms reordered to visually check occurrences
-- >>>
{- [ [["table"],["glass"],["wine"],["spoon"]]
   , [["glass"],["table"]]
   , [["spoon"],["table"]]
   , [["glass"],["table"],["wine"]]
   , [["glass"],["wine"]]
   ]
-}

metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text

-- | Occurrences
{-
fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
         , (fromList ["object"],fromList [(["object"], 3 )])
         , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
         , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
-}
metrics_occ = occurrences <$> L.concat <$> metrics_terms

{-
fromList [ ((["glas"]  ,["object"]), 6)
         , ((["glas"]  ,["spoon"]) , 4)
         , ((["glas"]  ,["table"]) , 6)
         , ((["object"],["spoon"]) , 6)
         , ((["object"],["table"]) , 9)
         , ((["spoon"] ,["table"]) , 6)
         ]
-}
metrics_cooc = cooc <$> metrics_terms

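-- | A sketch only: apply the illustrative 'filterConfigExample' defined above
-- to the example cooccurrences (this assumes the key type produced by 'cooc'
-- is ordered, as 'filterCooc' requires).
metrics_cooc_filtered = filterCooc filterConfigExample <$> metrics_cooc
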
metrics_cooc_mat = do
  m <- metrics_cooc
  let (ti,_) = createIndices m
  let mat_cooc = cooc2mat ti m
  pure ( ti
       , mat_cooc
       , incExcSpeGen_proba mat_cooc
       , incExcSpeGen mat_cooc
       )

metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc