]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics.hs
[WithList] merge function is right thx to @npouillard
[gargantext.git] / src / Gargantext / Text / Metrics.hs
1 {-|
2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly reexport functions in @Data.Text.Metrics@
11
12
13 TODO
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
16
17 -}
18
19 {-# LANGUAGE BangPatterns #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22
23 module Gargantext.Text.Metrics
24 where
25
26 import Data.Ord (Down(..))
27 import qualified Data.List as L
28
29 import Data.Map (Map)
30 import qualified Data.Map as M
31
32 import Data.Text (Text)
33 import qualified Data.Text as T
34
35 import Data.Tuple.Extra (both)
36 --import GHC.Real (Ratio)
37 --import qualified Data.Text.Metrics as DTM
38 import Data.Array.Accelerate (toList, Matrix)
39 --import Math.KMeans (kmeans, euclidSq, elements)
40
41 import Gargantext.Prelude
42 import Gargantext.Text.Metrics.Count (occurrences, cooc)
43 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
44 import Gargantext.Core (Lang(EN))
45 import Gargantext.Core.Types (Terms(..), Label)
46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
47 import Gargantext.Text.Metrics.Count (Grouped)
48 import Gargantext.Viz.Graph.Distances.Matrice
49 import Gargantext.Viz.Graph.Index
50
51 import qualified Data.Array.Accelerate.Interpreter as DAA
52 import qualified Data.Array.Accelerate as DAA
53 -- import Data.Array.Accelerate ((:.)(..), Z(..))
54
55 import GHC.Real (round)
56
57 --import Debug.Trace
58
59 data MapListSize = MapListSize Int
60 data InclusionSize = InclusionSize Int
61 data SampleBins = SampleBins Double
62 data Clusters = Clusters Int
63 data DefaultValue = DefaultValue Int
64
65 data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
66 , fc_inclusionSize :: InclusionSize
67 , fc_sampleBins :: SampleBins
68 , fc_clusters :: Clusters
69 , fc_defaultValue :: DefaultValue
70 }
71
72 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
73 filterCooc fc cc = (filterCooc' fc) ts cc
74 where
75 ts = map _scored_terms $ takeSome fc $ coocScored cc
76
77
78 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
79 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
80 foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
81 M.empty selection
82 where
83 selection = [(x,y) | x <- ts
84 , y <- ts
85 -- , x >= y
86 ]
87
88
89 -- | Map list creation
90 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
91 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
92 -- each parts is then ordered by Inclusion/Exclusion
93 -- take n scored terms in each parts where n * SampleBins = MapListSize.
94 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
95 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
96 $ takeSample n m
97 $ L.take l' $ sortWith (Down . _scored_incExc) scores
98 -- $ splitKmeans k scores
99 where
100 -- TODO: benchmark with accelerate-example kmeans version
101 --splitKmeans x xs = L.concat $ map elements
102 -- $ V.take (k-1)
103 -- $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
104 -- euclidSq x xs
105 n = round ((fromIntegral l)/s)
106 m = round $ (fromIntegral $ length scores) / (s)
107 takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
108 L.concat $ map (L.take n')
109 $ map (sortWith (Down . _scored_incExc))
110 -- TODO use kmeans s instead of splitEvery
111 -- in order to split in s heteregenous parts
112 -- without homogeneous order hypothesis
113 $ splitEvery m'
114 $ sortWith (Down . _scored_speGen) xs
115
116
117 data Scored t = Scored { _scored_terms :: !t
118 , _scored_incExc :: !InclusionExclusion
119 , _scored_speGen :: !SpecificityGenericity
120 } deriving (Show)
121
122 -- TODO in the textflow we end up needing these indices, it might be better
123 -- to compute them earlier and pass them around.
124 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
125 coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
126 where
127 (ti,fi) = createIndices m
128 (is, ss) = incExcSpeGen $ cooc2mat ti m
129 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
149 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
150 where
151 (ti,fi) = createIndices m
152 ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
153
154
155
156 metrics_text :: Text
157 metrics_text = T.intercalate " " metrics_sentences
158
159 metrics_sentences' :: [Text]
160 metrics_sentences' = splitBy (Sentences 0) metrics_text
161
162 -- | Sentences
163 metrics_sentences :: [Text]
164 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
165 , "I can see the glass on the table."
166 , "There was only a spoon on that table."
167 , "The glass just fall from the table, pouring wine everywhere."
168 , "I wish the glass did not contain wine."
169 ]
170
171 metrics_sentences_Test :: Bool
172 metrics_sentences_Test = metrics_sentences == metrics_sentences'
173
174 -- | Terms reordered to visually check occurrences
175 -- >>>
176 {- [ [["table"],["glass"],["wine"],["spoon"]]
177 , [["glass"],["table"]]
178 , [["spoon"],["table"]]
179 , [["glass"],["table"],["wine"]]
180 , [["glass"],["wine"]]
181 ]
182 -}
183
184 metrics_terms :: IO [[Terms]]
185 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
186
187 -- | Occurrences
188 {-
189 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
190 , (fromList ["object"],fromList [(["object"], 3 )])
191 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
192 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
193 -}
194 metrics_occ :: IO (Map Grouped (Map Terms Int))
195 metrics_occ = occurrences <$> L.concat <$> metrics_terms
196
197 {-
198 -- fromList [((["glas"],["object"]),6)
199 ,((["glas"],["spoon"]),4)
200 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
201
202 -}
203 metrics_cooc :: IO (Map (Label, Label) Int)
204 metrics_cooc = cooc <$> metrics_terms
205
206 metrics_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity))
207 metrics_cooc_mat = do
208 m <- metrics_cooc
209 let (ti,_) = createIndices m
210 let mat_cooc = cooc2mat ti m
211 pure ( ti
212 , mat_cooc
213 , incExcSpeGen_proba mat_cooc
214 , incExcSpeGen mat_cooc
215 )
216
217 metrics_incExcSpeGen :: IO ([(Label, Double)], [(Label, Double)])
218 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
219