2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Mainly reexport functions in @Data.Text.Metrics@
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
19 {-# LANGUAGE BangPatterns #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
23 module Gargantext.Text.Metrics
26 import Data.Text (Text, pack)
28 import qualified Data.List as L
29 import qualified Data.Map as M
30 import qualified Data.Set as S
31 import qualified Data.Text as T
32 import qualified Data.Vector as V
33 import qualified Data.Vector.Unboxed as VU
34 import Data.Tuple.Extra (both)
35 --import GHC.Real (Ratio)
36 --import qualified Data.Text.Metrics as DTM
37 import Data.Array.Accelerate (toList)
38 import Math.KMeans (kmeans, euclidSq, elements)
41 import Gargantext.Prelude
43 import Gargantext.Text.Metrics.Count (occurrences, cooc)
44 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
45 import Gargantext.Core (Lang(EN))
46 import Gargantext.Core.Types (Terms(..))
47 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
49 import Gargantext.Viz.Graph.Distances.Matrice
50 import Gargantext.Viz.Graph.Index
52 import qualified Data.Array.Accelerate.Interpreter as DAA
53 import qualified Data.Array.Accelerate as DAA
54 -- import Data.Array.Accelerate ((:.)(..), Z(..))
56 import GHC.Real (round)
61 data MapListSize = MapListSize Int
62 data InclusionSize = InclusionSize Int
63 data SampleBins = SampleBins Double
64 data Clusters = Clusters Int
65 data DefaultValue = DefaultValue Int
67 data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
68 , fc_inclusionSize :: InclusionSize
69 , fc_sampleBins :: SampleBins
70 , fc_clusters :: Clusters
71 , fc_defaultValue :: DefaultValue
74 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
75 filterCooc fc cc = (filterCooc' fc) ts cc
77 ts = map _scored_terms $ takeSome fc $ coocScored cc
80 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
81 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
82 foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
85 selection = [(x,y) | x <- ts, y <- ts, x > y]
88 -- | Map list creation
89 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
90 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
91 -- each parts is then ordered by Inclusion/Exclusion
92 -- take n scored terms in each parts where n * SampleBins = MapListSize.
93 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
94 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
96 $ L.take l' $ L.reverse $ L.sortOn _scored_incExc scores
97 -- $ splitKmeans k scores
99 -- TODO: benchmark with accelerate-example kmeans version
100 splitKmeans x xs = L.concat $ map elements
102 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
104 n = round ((fromIntegral l)/s)
105 m = round $ (fromIntegral $ length scores) / (s)
106 takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
107 L.concat $ map (L.take n)
108 $ map (reverse . (L.sortOn _scored_incExc))
109 -- TODO use kmeans s instead of splitEvery
110 -- in order to split in s heteregenous parts
111 -- without homogeneous order hypothesis
113 $ L.reverse $ L.sortOn _scored_speGen xs
116 data Scored t = Scored { _scored_terms :: !t
117 , _scored_incExc :: !InclusionExclusion
118 , _scored_speGen :: !SpecificityGenericity
121 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
122 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
124 (ti,fi) = createIndices m
125 (is, ss) = incExcSpeGen $ cooc2mat ti m
126 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
147 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
148 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
150 (ti,fi) = createIndices m
151 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
156 metrics_text = T.intercalate " " metrics_sentences
158 metrics_sentences' :: [Text]
159 metrics_sentences' = splitBy (Sentences 0) metrics_text
162 metrics_sentences :: [Text]
163 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
164 , "I can see the glass on the table."
165 , "There was only a spoon on that table."
166 , "The glass just fall from the table, pouring wine everywhere."
167 , "I wish the glass did not contain wine."
170 metrics_sentences_Test = metrics_sentences == metrics_sentences'
172 -- | Terms reordered to visually check occurrences
174 {- [ [["table"],["glass"],["wine"],["spoon"]]
175 , [["glass"],["table"]]
176 , [["spoon"],["table"]]
177 , [["glass"],["table"],["wine"]]
178 , [["glass"],["wine"]]
182 metrics_terms :: IO [[Terms]]
183 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
187 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
188 , (fromList ["object"],fromList [(["object"], 3 )])
189 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
190 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
192 metrics_occ = occurrences <$> L.concat <$> metrics_terms
195 -- fromList [((["glas"],["object"]),6)
196 ,((["glas"],["spoon"]),4)
197 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
200 metrics_cooc = cooc <$> metrics_terms
202 metrics_cooc_mat = do
204 let (ti,_) = createIndices m
205 let mat_cooc = cooc2mat ti m
208 , incExcSpeGen_proba mat_cooc
209 , incExcSpeGen mat_cooc
212 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc