2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Mainly reexport functions in @Data.Text.Metrics@
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
19 {-# LANGUAGE BangPatterns #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
23 module Gargantext.Text.Metrics
26 import Data.Text (Text, pack)
27 import Data.Ord (comparing, Down(..))
29 import qualified Data.List as L
30 import qualified Data.Map as M
31 import qualified Data.Set as S
32 import qualified Data.Text as T
33 import qualified Data.Vector as V
34 import qualified Data.Vector.Unboxed as VU
35 import Data.Tuple.Extra (both)
36 --import GHC.Real (Ratio)
37 --import qualified Data.Text.Metrics as DTM
38 import Data.Array.Accelerate (toList)
39 import Math.KMeans (kmeans, euclidSq, elements)
42 import Gargantext.Prelude
44 import Gargantext.Text.Metrics.Count (occurrences, cooc)
45 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
46 import Gargantext.Core (Lang(EN))
47 import Gargantext.Core.Types (Terms(..))
48 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
50 import Gargantext.Viz.Graph.Distances.Matrice
51 import Gargantext.Viz.Graph.Index
53 import qualified Data.Array.Accelerate.Interpreter as DAA
54 import qualified Data.Array.Accelerate as DAA
55 -- import Data.Array.Accelerate ((:.)(..), Z(..))
57 import GHC.Real (round)
62 data MapListSize = MapListSize Int
63 data InclusionSize = InclusionSize Int
64 data SampleBins = SampleBins Double
65 data Clusters = Clusters Int
66 data DefaultValue = DefaultValue Int
68 data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
69 , fc_inclusionSize :: InclusionSize
70 , fc_sampleBins :: SampleBins
71 , fc_clusters :: Clusters
72 , fc_defaultValue :: DefaultValue
75 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
76 filterCooc fc cc = (filterCooc' fc) ts cc
78 ts = map _scored_terms $ takeSome fc $ coocScored cc
81 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
82 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
83 foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
86 selection = [(x,y) | x <- ts
92 -- | Map list creation
93 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
94 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
95 -- each parts is then ordered by Inclusion/Exclusion
96 -- take n scored terms in each parts where n * SampleBins = MapListSize.
97 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
98 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
100 $ L.take l' $ sortWith (Down . _scored_incExc) scores
101 -- $ splitKmeans k scores
103 -- TODO: benchmark with accelerate-example kmeans version
104 splitKmeans x xs = L.concat $ map elements
106 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
108 n = round ((fromIntegral l)/s)
109 m = round $ (fromIntegral $ length scores) / (s)
110 takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
111 L.concat $ map (L.take n)
112 $ map (sortWith (Down . _scored_incExc))
113 -- TODO use kmeans s instead of splitEvery
114 -- in order to split in s heteregenous parts
115 -- without homogeneous order hypothesis
117 $ sortWith (Down . _scored_speGen) xs
120 data Scored t = Scored { _scored_terms :: !t
121 , _scored_incExc :: !InclusionExclusion
122 , _scored_speGen :: !SpecificityGenericity
125 -- TODO in the textflow we end up needing these indices, it might be better
126 -- to compute them earlier and pass them around.
127 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
128 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
130 (ti,fi) = createIndices m
131 (is, ss) = incExcSpeGen $ cooc2mat ti m
132 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
151 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
152 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
154 (ti,fi) = createIndices m
155 ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
160 metrics_text = T.intercalate " " metrics_sentences
162 metrics_sentences' :: [Text]
163 metrics_sentences' = splitBy (Sentences 0) metrics_text
166 metrics_sentences :: [Text]
167 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
168 , "I can see the glass on the table."
169 , "There was only a spoon on that table."
170 , "The glass just fall from the table, pouring wine everywhere."
171 , "I wish the glass did not contain wine."
174 metrics_sentences_Test = metrics_sentences == metrics_sentences'
176 -- | Terms reordered to visually check occurrences
178 {- [ [["table"],["glass"],["wine"],["spoon"]]
179 , [["glass"],["table"]]
180 , [["spoon"],["table"]]
181 , [["glass"],["table"],["wine"]]
182 , [["glass"],["wine"]]
186 metrics_terms :: IO [[Terms]]
187 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
191 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
192 , (fromList ["object"],fromList [(["object"], 3 )])
193 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
194 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
196 metrics_occ = occurrences <$> L.concat <$> metrics_terms
199 -- fromList [((["glas"],["object"]),6)
200 ,((["glas"],["spoon"]),4)
201 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
204 metrics_cooc = cooc <$> metrics_terms
206 metrics_cooc_mat = do
208 let (ti,_) = createIndices m
209 let mat_cooc = cooc2mat ti m
212 , incExcSpeGen_proba mat_cooc
213 , incExcSpeGen mat_cooc
216 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc