]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics.hs
[MERGE] bashql + pipeline.
[gargantext.git] / src / Gargantext / Text / Metrics.hs
1 {-|
2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly reexport functions in @Data.Text.Metrics@
11
12
13 TODO
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
16
17 -}
18
19 {-# LANGUAGE BangPatterns #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22
23 module Gargantext.Text.Metrics
24 where
25
26 import Data.Text (Text, pack)
27 import Data.Ord (comparing, Down(..))
28 import Data.Map (Map)
29 import qualified Data.List as L
30 import qualified Data.Map as M
31 import qualified Data.Set as S
32 import qualified Data.Text as T
33 import qualified Data.Vector as V
34 import qualified Data.Vector.Unboxed as VU
35 import Data.Tuple.Extra (both)
36 --import GHC.Real (Ratio)
37 --import qualified Data.Text.Metrics as DTM
38 import Data.Array.Accelerate (toList)
39 import Math.KMeans (kmeans, euclidSq, elements)
40
41
42 import Gargantext.Prelude
43
44 import Gargantext.Text.Metrics.Count (occurrences, cooc)
45 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
46 import Gargantext.Core (Lang(EN))
47 import Gargantext.Core.Types (Terms(..))
48 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
49
50 import Gargantext.Viz.Graph.Distances.Matrice
51 import Gargantext.Viz.Graph.Index
52
53 import qualified Data.Array.Accelerate.Interpreter as DAA
54 import qualified Data.Array.Accelerate as DAA
55 -- import Data.Array.Accelerate ((:.)(..), Z(..))
56
57 import GHC.Real (round)
58
59 import Debug.Trace
60 import Prelude (seq)
61
62 data MapListSize = MapListSize Int
63 data InclusionSize = InclusionSize Int
64 data SampleBins = SampleBins Double
65 data Clusters = Clusters Int
66 data DefaultValue = DefaultValue Int
67
68 data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
69 , fc_inclusionSize :: InclusionSize
70 , fc_sampleBins :: SampleBins
71 , fc_clusters :: Clusters
72 , fc_defaultValue :: DefaultValue
73 }
74
75 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
76 filterCooc fc cc = (filterCooc' fc) ts cc
77 where
78 ts = map _scored_terms $ takeSome fc $ coocScored cc
79
80
81 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
82 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
83 foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
84 M.empty selection
85 where
86 selection = [(x,y) | x <- ts
87 , y <- ts
88 -- , x >= y
89 ]
90
91
92 -- | Map list creation
93 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
94 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
95 -- each parts is then ordered by Inclusion/Exclusion
96 -- take n scored terms in each parts where n * SampleBins = MapListSize.
97 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
98 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
99 $ takeSample n m
100 $ L.take l' $ sortWith (Down . _scored_incExc) scores
101 -- $ splitKmeans k scores
102 where
103 -- TODO: benchmark with accelerate-example kmeans version
104 splitKmeans x xs = L.concat $ map elements
105 $ V.take (k-1)
106 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
107 euclidSq x xs
108 n = round ((fromIntegral l)/s)
109 m = round $ (fromIntegral $ length scores) / (s)
110 takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
111 L.concat $ map (L.take n)
112 $ map (sortWith (Down . _scored_incExc))
113 -- TODO use kmeans s instead of splitEvery
114 -- in order to split in s heteregenous parts
115 -- without homogeneous order hypothesis
116 $ splitEvery m
117 $ sortWith (Down . _scored_speGen) xs
118
119
120 data Scored t = Scored { _scored_terms :: !t
121 , _scored_incExc :: !InclusionExclusion
122 , _scored_speGen :: !SpecificityGenericity
123 } deriving (Show)
124
125 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
126 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
127 where
128 (ti,fi) = createIndices m
129 (is, ss) = incExcSpeGen $ cooc2mat ti m
130 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
150 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
151 where
152 (ti,fi) = createIndices m
153 ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
154
155
156
157 metrics_text :: Text
158 metrics_text = T.intercalate " " metrics_sentences
159
160 metrics_sentences' :: [Text]
161 metrics_sentences' = splitBy (Sentences 0) metrics_text
162
163 -- | Sentences
164 metrics_sentences :: [Text]
165 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
166 , "I can see the glass on the table."
167 , "There was only a spoon on that table."
168 , "The glass just fall from the table, pouring wine everywhere."
169 , "I wish the glass did not contain wine."
170 ]
171
172 metrics_sentences_Test = metrics_sentences == metrics_sentences'
173
174 -- | Terms reordered to visually check occurrences
175 -- >>>
176 {- [ [["table"],["glass"],["wine"],["spoon"]]
177 , [["glass"],["table"]]
178 , [["spoon"],["table"]]
179 , [["glass"],["table"],["wine"]]
180 , [["glass"],["wine"]]
181 ]
182 -}
183
184 metrics_terms :: IO [[Terms]]
185 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
186
187 -- | Occurrences
188 {-
189 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
190 , (fromList ["object"],fromList [(["object"], 3 )])
191 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
192 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
193 -}
194 metrics_occ = occurrences <$> L.concat <$> metrics_terms
195
196 {-
197 -- fromList [((["glas"],["object"]),6)
198 ,((["glas"],["spoon"]),4)
199 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
200
201 -}
202 metrics_cooc = cooc <$> metrics_terms
203
204 metrics_cooc_mat = do
205 m <- metrics_cooc
206 let (ti,_) = createIndices m
207 let mat_cooc = cooc2mat ti m
208 pure ( ti
209 , mat_cooc
210 , incExcSpeGen_proba mat_cooc
211 , incExcSpeGen mat_cooc
212 )
213
214 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
215