2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Mainly reexport functions in @Data.Text.Metrics@
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
22 module Gargantext.Text.Metrics
25 import Data.Text (Text, pack)
27 import qualified Data.List as L
28 import qualified Data.Map as M
29 import qualified Data.Set as S
30 import qualified Data.Text as T
31 import qualified Data.Vector as V
32 import qualified Data.Vector.Unboxed as VU
33 import Data.Tuple.Extra (both)
34 --import GHC.Real (Ratio)
35 --import qualified Data.Text.Metrics as DTM
36 import Data.Array.Accelerate (toList)
37 import Math.KMeans (kmeans, euclidSq, elements)
40 import Gargantext.Prelude
42 import Gargantext.Text.Metrics.Count (occurrences, cooc)
43 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
44 import Gargantext.Core (Lang(EN))
45 import Gargantext.Core.Types (Terms(..))
46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
48 import Gargantext.Viz.Graph.Distances.Matrice
49 import Gargantext.Viz.Graph.Index
51 import qualified Data.Array.Accelerate.Interpreter as DAA
52 import qualified Data.Array.Accelerate as DAA
54 import GHC.Real (round)
56 --filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
58 ---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
59 ----(ti, fi) = createIndices m
60 -- . fromIndex fi $ filterMat $ cooc2mat ti m
63 type MapListSize = Int
64 type SampleBins = Double
67 -- | Map list creation
68 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
69 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
70 -- each parts is then ordered by Inclusion/Exclusion
71 -- take n scored terms in each parts where n * SampleBins = MapListSize.
72 takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
73 takeSome l s k scores = L.take l
75 $ splitKmeans k scores
77 -- TODO: benchmark with accelerate-example kmeans version
78 splitKmeans x xs = elements
80 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
82 n = round ((fromIntegral l)/s)
83 m = round $ (fromIntegral $ length scores) / (s)
84 takeSample n m xs = L.concat $ map (L.take n)
85 $ L.reverse $ map (L.sortOn _scored_incExc)
87 $ L.reverse $ L.sortOn _scored_speGen xs
90 data Scored t = Scored { _scored_terms :: t
91 , _scored_incExc :: InclusionExclusion
92 , _scored_speGen :: SpecificityGenericity
95 incExcSpeGen_sorted' :: (DAA.Elt t, Ord t) => Map (t,t) Int -> [Scored t]
96 incExcSpeGen_sorted' m = map (\(t,inc,spe) -> Scored t inc spe) scores
98 (ti,fi) = createIndices m
99 (is, ss) = incExcSpeGen $ cooc2mat ti m
100 scores = DAA.toList $ DAA.run $ DAA.zip3 (DAA.use ts) (DAA.use is) (DAA.use ss)
101 ts = DAA.fromList (DAA.arrayShape is) (snd <$> M.toAscList fi)
102 -- TODO fi should already be a Vector
105 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
106 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
108 (ti,fi) = createIndices m
109 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
115 metrics_text = T.intercalate " " metrics_sentences
117 metrics_sentences' :: [Text]
118 metrics_sentences' = splitBy (Sentences 0) metrics_text
121 metrics_sentences :: [Text]
122 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
123 , "I can see the glass on the table."
124 , "There was only a spoon on that table."
125 , "The glass just fall from the table, pouring wine everywhere."
126 , "I wish the glass did not contain wine."
129 metrics_sentences_Test = metrics_sentences == metrics_sentences'
131 -- | Terms reordered to visually check occurrences
133 {- [ [["table"],["glass"],["wine"],["spoon"]]
134 , [["glass"],["table"]]
135 , [["spoon"],["table"]]
136 , [["glass"],["table"],["wine"]]
137 , [["glass"],["wine"]]
141 metrics_terms :: IO [[Terms]]
142 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
146 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
147 , (fromList ["object"],fromList [(["object"], 3 )])
148 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
149 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
151 metrics_occ = occurrences <$> L.concat <$> metrics_terms
154 -- fromList [((["glas"],["object"]),6)
155 ,((["glas"],["spoon"]),4)
156 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
159 metrics_cooc = cooc <$> metrics_terms
161 metrics_cooc_mat = do
163 let (ti,_) = createIndices m
164 let mat_cooc = cooc2mat ti m
167 , incExcSpeGen_proba mat_cooc
168 , incExcSpeGen mat_cooc
171 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc