]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics.hs
[MERGE] needs fix.
[gargantext.git] / src / Gargantext / Text / Metrics.hs
1 {-|
2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly reexport functions in @Data.Text.Metrics@
11
12
13 TODO
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
16
17 -}
18
19 {-# LANGUAGE BangPatterns #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22
23 module Gargantext.Text.Metrics
24 where
25
26 import Data.Text (Text, pack)
27 import Data.Map (Map)
28 import qualified Data.List as L
29 import qualified Data.Map as M
30 import qualified Data.Set as S
31 import qualified Data.Text as T
32 import qualified Data.Vector as V
33 import qualified Data.Vector.Unboxed as VU
34 import Data.Tuple.Extra (both)
35 --import GHC.Real (Ratio)
36 --import qualified Data.Text.Metrics as DTM
37 import Data.Array.Accelerate (toList)
38 import Math.KMeans (kmeans, euclidSq, elements)
39
40
41 import Gargantext.Prelude
42
43 import Gargantext.Text.Metrics.Count (occurrences, cooc)
44 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
45 import Gargantext.Core (Lang(EN))
46 import Gargantext.Core.Types (Terms(..))
47 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
48
49 import Gargantext.Viz.Graph.Distances.Matrice
50 import Gargantext.Viz.Graph.Index
51
52 import qualified Data.Array.Accelerate.Interpreter as DAA
53 import qualified Data.Array.Accelerate as DAA
54
55 import GHC.Real (round)
56
57 filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
58 filterCooc cc = filterCooc' ts cc
59 where
60 ts = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
61
62 filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
63 filterCooc' ts m = foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m') M.empty selection
64 where
65 errMessage = panic "Filter cooc: no key"
66 selection = [(x,y) | x <- ts, y <- ts, x > y]
67
68
69 type MapListSize = Int
70 type SampleBins = Double
71 type Clusters = Int
72
73 -- | Map list creation
74 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
75 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
76 -- each parts is then ordered by Inclusion/Exclusion
77 -- take n scored terms in each parts where n * SampleBins = MapListSize.
78 takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
79 takeSome l s k scores = L.take l
80 $ takeSample n m
81 $ splitKmeans k scores
82 where
83 -- TODO: benchmark with accelerate-example kmeans version
84 splitKmeans x xs = elements
85 $ V.head
86 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
87 euclidSq x xs
88 n = round ((fromIntegral l)/s)
89 m = round $ (fromIntegral $ length scores) / (s)
90 takeSample n m xs = L.concat $ map (L.take n)
91 $ L.reverse $ map (L.sortOn _scored_incExc)
92 -- TODO use kmeans s instead of splitEvery
93 -- in order to split in s heteregenous parts
94 -- without homogeneous order hypothesis
95 $ splitEvery m
96 $ L.reverse $ L.sortOn _scored_speGen xs
97
98
99 data Scored t = Scored { _scored_terms :: !t
100 , _scored_incExc :: !InclusionExclusion
101 , _scored_speGen :: !SpecificityGenericity
102 } deriving (Show)
103
104 --coocScored :: Ord t => Map (t,t) Int -> [Scored t]
105 --coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
106 coocScored :: (DAA.Elt t, Ord t) => Map (t,t) Int -> [Scored t]
107 coocScored m = map (\(t,inc,spe) -> Scored t inc spe) scores
108 where
109 (ti,fi) = createIndices m
110 (is, ss) = incExcSpeGen $ cooc2mat ti m
111 scores = DAA.toList $ DAA.run $ DAA.zip3 (DAA.use ts) (DAA.use is) (DAA.use ss)
112 ts = DAA.fromList (DAA.arrayShape is) (snd <$> M.toAscList fi)
113 -- TODO fi should already be a Vector
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
135 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
136 where
137 (ti,fi) = createIndices m
138 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
139
140
141
142 metrics_text :: Text
143 metrics_text = T.intercalate " " metrics_sentences
144
145 metrics_sentences' :: [Text]
146 metrics_sentences' = splitBy (Sentences 0) metrics_text
147
148 -- | Sentences
149 metrics_sentences :: [Text]
150 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
151 , "I can see the glass on the table."
152 , "There was only a spoon on that table."
153 , "The glass just fall from the table, pouring wine everywhere."
154 , "I wish the glass did not contain wine."
155 ]
156
157 metrics_sentences_Test = metrics_sentences == metrics_sentences'
158
159 -- | Terms reordered to visually check occurrences
160 -- >>>
161 {- [ [["table"],["glass"],["wine"],["spoon"]]
162 , [["glass"],["table"]]
163 , [["spoon"],["table"]]
164 , [["glass"],["table"],["wine"]]
165 , [["glass"],["wine"]]
166 ]
167 -}
168
169 metrics_terms :: IO [[Terms]]
170 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
171
172 -- | Occurrences
173 {-
174 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
175 , (fromList ["object"],fromList [(["object"], 3 )])
176 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
177 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
178 -}
179 metrics_occ = occurrences <$> L.concat <$> metrics_terms
180
181 {-
182 -- fromList [((["glas"],["object"]),6)
183 ,((["glas"],["spoon"]),4)
184 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
185
186 -}
187 metrics_cooc = cooc <$> metrics_terms
188
189 metrics_cooc_mat = do
190 m <- metrics_cooc
191 let (ti,_) = createIndices m
192 let mat_cooc = cooc2mat ti m
193 pure ( ti
194 , mat_cooc
195 , incExcSpeGen_proba mat_cooc
196 , incExcSpeGen mat_cooc
197 )
198
199 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
200