2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Mainly reexport functions in @Data.Text.Metrics@
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
22 module Gargantext.Text.Metrics
25 import Data.Text (Text, pack)
28 import qualified Data.List as L
29 import qualified Data.Map as M
30 import qualified Data.Set as S
31 import qualified Data.Text as T
32 import Data.Tuple.Extra (both)
33 --import GHC.Real (Ratio)
34 --import qualified Data.Text.Metrics as DTM
35 import Data.Array.Accelerate (toList)
38 import Gargantext.Prelude
40 import Gargantext.Text.Metrics.Count (occurrences, cooc)
41 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
42 import Gargantext.Core (Lang(EN))
43 import Gargantext.Core.Types (Terms(..))
44 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
46 import Gargantext.Viz.Graph.Distances.Matrice
47 import Gargantext.Viz.Graph.Index
49 import qualified Data.Array.Accelerate.Interpreter as DAA
50 import qualified Data.Array.Accelerate as DAA
52 import GHC.Real (round)
54 --filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
56 ---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
57 ----(ti, fi) = createIndices m
58 -- . fromIndex fi $ filterMat $ cooc2mat ti m
64 takeSome :: Ord t => ListSize -> BinSize -> [Scored t] -> [Scored t]
65 takeSome l s scores = L.take l
68 $ L.reverse $ L.sortOn _scored_incExc scores
70 -- TODO : KMEAN split into 2 main clusters
71 -- (advice: use accelerate-example kmeans version
72 -- and maybe benchmark it to be sure)
75 n = round ((fromIntegral l)/s)
76 m = round $ (fromIntegral $ length scores) / (s)
77 takeSample n m xs = L.concat $ map (L.take n)
78 $ L.reverse $ map (L.sortOn _scored_incExc)
80 $ L.reverse $ L.sortOn _scored_speGen xs
83 data Scored t = Scored { _scored_terms :: t
84 , _scored_incExc :: InclusionExclusion
85 , _scored_speGen :: SpecificityGenericity
88 incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
89 incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
91 (ti,fi) = createIndices m
92 (is, ss) = incExcSpeGen $ cooc2mat ti m
93 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
96 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
97 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
99 (ti,fi) = createIndices m
100 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
106 metrics_text = T.intercalate " " metrics_sentences
108 metrics_sentences' :: [Text]
109 metrics_sentences' = splitBy (Sentences 0) metrics_text
112 metrics_sentences :: [Text]
113 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
114 , "I can see the glass on the table."
115 , "There was only a spoon on that table."
116 , "The glass just fall from the table, pouring wine everywhere."
117 , "I wish the glass did not contain wine."
120 metrics_sentences_Test = metrics_sentences == metrics_sentences'
122 -- | Terms reordered to visually check occurrences
124 {- [ [["table"],["glass"],["wine"],["spoon"]]
125 , [["glass"],["table"]]
126 , [["spoon"],["table"]]
127 , [["glass"],["table"],["wine"]]
128 , [["glass"],["wine"]]
132 metrics_terms :: IO [[Terms]]
133 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
137 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
138 , (fromList ["object"],fromList [(["object"], 3 )])
139 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
140 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
142 metrics_occ = occurrences <$> L.concat <$> metrics_terms
145 -- fromList [((["glas"],["object"]),6)
146 ,((["glas"],["spoon"]),4)
147 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
150 metrics_cooc = cooc <$> metrics_terms
152 metrics_cooc_mat = do
154 let (ti,_) = createIndices m
155 let mat_cooc = cooc2mat ti m
158 , incExcSpeGen_proba mat_cooc
159 , incExcSpeGen mat_cooc
162 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc