]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics.hs
Some progress on workflow, the data2graph fails because of unknown nodes, see the...
[gargantext.git] / src / Gargantext / Text / Metrics.hs
1 {-|
2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly reexport functions in @Data.Text.Metrics@
11
12
13 TODO
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
16
17 -}
18
19 {-# LANGUAGE BangPatterns #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22
23 module Gargantext.Text.Metrics
24 where
25
26 import Data.Text (Text, pack)
27 import Data.Map (Map)
28 import qualified Data.List as L
29 import qualified Data.Map as M
30 import qualified Data.Set as S
31 import qualified Data.Text as T
32 import qualified Data.Vector as V
33 import qualified Data.Vector.Unboxed as VU
34 import Data.Tuple.Extra (both)
35 --import GHC.Real (Ratio)
36 --import qualified Data.Text.Metrics as DTM
37 import Data.Array.Accelerate (toList)
38 import Math.KMeans (kmeans, euclidSq, elements)
39
40
41 import Gargantext.Prelude
42
43 import Gargantext.Text.Metrics.Count (occurrences, cooc)
44 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
45 import Gargantext.Core (Lang(EN))
46 import Gargantext.Core.Types (Terms(..))
47 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
48
49 import Gargantext.Viz.Graph.Distances.Matrice
50 import Gargantext.Viz.Graph.Index
51
52 import qualified Data.Array.Accelerate.Interpreter as DAA
53 import qualified Data.Array.Accelerate as DAA
54 -- import Data.Array.Accelerate ((:.)(..), Z(..))
55
56 import GHC.Real (round)
57
58 import Debug.Trace
59 import Prelude (seq)
60
61 filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
62 filterCooc cc = filterCooc' ts cc
63 where
64 ts = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
65
66 filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
67 filterCooc' ts m = -- trace ("coocScored " <> show (length ts)) $
68 foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m')
69 M.empty selection
70 where
71 errMessage = panic "Filter cooc: no key"
72 selection = [(x,y) | x <- ts, y <- ts, x > y]
73
74
75 type MapListSize = Int
76 type SampleBins = Double
77 type Clusters = Int
78
79 -- | Map list creation
80 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
81 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
82 -- each parts is then ordered by Inclusion/Exclusion
83 -- take n scored terms in each parts where n * SampleBins = MapListSize.
84 takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
85 takeSome l s k scores = L.take l
86 $ takeSample n m
87 $ splitKmeans k scores
88 where
89 -- TODO: benchmark with accelerate-example kmeans version
90 splitKmeans x xs = elements
91 $ V.head
92 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
93 euclidSq x xs
94 n = round ((fromIntegral l)/s)
95 m = round $ (fromIntegral $ length scores) / (s)
96 takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
97 L.concat $ map (L.take n)
98 $ L.reverse $ map (L.sortOn _scored_incExc)
99 -- TODO use kmeans s instead of splitEvery
100 -- in order to split in s heteregenous parts
101 -- without homogeneous order hypothesis
102 $ splitEvery m
103 $ L.reverse $ L.sortOn _scored_speGen xs
104
105
106 data Scored t = Scored { _scored_terms :: !t
107 , _scored_incExc :: !InclusionExclusion
108 , _scored_speGen :: !SpecificityGenericity
109 } deriving (Show)
110
111 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
112 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
113 where
114 (ti,fi) = createIndices m
115 (is, ss) = incExcSpeGen $ cooc2mat ti m
116 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
138 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
139 where
140 (ti,fi) = createIndices m
141 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
142
143
144
145 metrics_text :: Text
146 metrics_text = T.intercalate " " metrics_sentences
147
148 metrics_sentences' :: [Text]
149 metrics_sentences' = splitBy (Sentences 0) metrics_text
150
151 -- | Sentences
152 metrics_sentences :: [Text]
153 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
154 , "I can see the glass on the table."
155 , "There was only a spoon on that table."
156 , "The glass just fall from the table, pouring wine everywhere."
157 , "I wish the glass did not contain wine."
158 ]
159
160 metrics_sentences_Test = metrics_sentences == metrics_sentences'
161
162 -- | Terms reordered to visually check occurrences
163 -- >>>
164 {- [ [["table"],["glass"],["wine"],["spoon"]]
165 , [["glass"],["table"]]
166 , [["spoon"],["table"]]
167 , [["glass"],["table"],["wine"]]
168 , [["glass"],["wine"]]
169 ]
170 -}
171
172 metrics_terms :: IO [[Terms]]
173 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
174
175 -- | Occurrences
176 {-
177 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
178 , (fromList ["object"],fromList [(["object"], 3 )])
179 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
180 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
181 -}
182 metrics_occ = occurrences <$> L.concat <$> metrics_terms
183
184 {-
185 -- fromList [((["glas"],["object"]),6)
186 ,((["glas"],["spoon"]),4)
187 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
188
189 -}
190 metrics_cooc = cooc <$> metrics_terms
191
192 metrics_cooc_mat = do
193 m <- metrics_cooc
194 let (ti,_) = createIndices m
195 let mat_cooc = cooc2mat ti m
196 pure ( ti
197 , mat_cooc
198 , incExcSpeGen_proba mat_cooc
199 , incExcSpeGen mat_cooc
200 )
201
202 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
203