2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Mainly reexport functions in @Data.Text.Metrics@
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
19 {-# LANGUAGE BangPatterns #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
23 module Gargantext.Text.Metrics
26 import Data.Text (Text, pack)
28 import qualified Data.List as L
29 import qualified Data.Map as M
30 import qualified Data.Set as S
31 import qualified Data.Text as T
32 import qualified Data.Vector as V
33 import qualified Data.Vector.Unboxed as VU
34 import Data.Tuple.Extra (both)
35 --import GHC.Real (Ratio)
36 --import qualified Data.Text.Metrics as DTM
37 import Data.Array.Accelerate (toList)
38 import Math.KMeans (kmeans, euclidSq, elements)
41 import Gargantext.Prelude
43 import Gargantext.Text.Metrics.Count (occurrences, cooc)
44 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
45 import Gargantext.Core (Lang(EN))
46 import Gargantext.Core.Types (Terms(..))
47 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
49 import Gargantext.Viz.Graph.Distances.Matrice
50 import Gargantext.Viz.Graph.Index
52 import qualified Data.Array.Accelerate.Interpreter as DAA
53 import qualified Data.Array.Accelerate as DAA
55 import GHC.Real (round)
57 filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
58 filterCooc cc = filterCooc' ts cc
60 ts = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
62 filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
63 filterCooc' ts m = foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m') M.empty selection
65 errMessage = panic "Filter cooc: no key"
66 selection = [(x,y) | x <- ts, y <- ts, x > y]
69 type MapListSize = Int
70 type SampleBins = Double
73 -- | Map list creation
74 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
75 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
76 -- each parts is then ordered by Inclusion/Exclusion
77 -- take n scored terms in each parts where n * SampleBins = MapListSize.
78 takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
79 takeSome l s k scores = L.take l
81 $ splitKmeans k scores
83 -- TODO: benchmark with accelerate-example kmeans version
84 splitKmeans x xs = elements
86 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
88 n = round ((fromIntegral l)/s)
89 m = round $ (fromIntegral $ length scores) / (s)
90 takeSample n m xs = L.concat $ map (L.take n)
91 $ L.reverse $ map (L.sortOn _scored_incExc)
92 -- TODO use kmeans s instead of splitEvery
93 -- in order to split in s heteregenous parts
94 -- without homogeneous order hypothesis
96 $ L.reverse $ L.sortOn _scored_speGen xs
99 data Scored t = Scored { _scored_terms :: !t
100 , _scored_incExc :: !InclusionExclusion
101 , _scored_speGen :: !SpecificityGenericity
104 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
105 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
107 (ti,fi) = createIndices m
108 (is, ss) = incExcSpeGen $ cooc2mat ti m
109 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
130 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
131 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
133 (ti,fi) = createIndices m
134 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
139 metrics_text = T.intercalate " " metrics_sentences
141 metrics_sentences' :: [Text]
142 metrics_sentences' = splitBy (Sentences 0) metrics_text
145 metrics_sentences :: [Text]
146 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
147 , "I can see the glass on the table."
148 , "There was only a spoon on that table."
149 , "The glass just fall from the table, pouring wine everywhere."
150 , "I wish the glass did not contain wine."
153 metrics_sentences_Test = metrics_sentences == metrics_sentences'
155 -- | Terms reordered to visually check occurrences
157 {- [ [["table"],["glass"],["wine"],["spoon"]]
158 , [["glass"],["table"]]
159 , [["spoon"],["table"]]
160 , [["glass"],["table"],["wine"]]
161 , [["glass"],["wine"]]
165 metrics_terms :: IO [[Terms]]
166 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
170 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
171 , (fromList ["object"],fromList [(["object"], 3 )])
172 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
173 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
175 metrics_occ = occurrences <$> L.concat <$> metrics_terms
178 -- fromList [((["glas"],["object"]),6)
179 ,((["glas"],["spoon"]),4)
180 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
183 metrics_cooc = cooc <$> metrics_terms
185 metrics_cooc_mat = do
187 let (ti,_) = createIndices m
188 let mat_cooc = cooc2mat ti m
191 , incExcSpeGen_proba mat_cooc
192 , incExcSpeGen mat_cooc
195 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc