]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics.hs
[Map list automatic filtering] adding doc.
[gargantext.git] / src / Gargantext / Text / Metrics.hs
1 {-|
2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly reexport functions in @Data.Text.Metrics@
11
12
13 TODO
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
16
17 -}
18
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21
22 module Gargantext.Text.Metrics
23 where
24
25 import Data.Text (Text, pack)
26 import Data.Map (Map)
27 import qualified Data.List as L
28 import qualified Data.Map as M
29 import qualified Data.Set as S
30 import qualified Data.Text as T
31 import qualified Data.Vector as V
32 import qualified Data.Vector.Unboxed as VU
33 import Data.Tuple.Extra (both)
34 --import GHC.Real (Ratio)
35 --import qualified Data.Text.Metrics as DTM
36 import Data.Array.Accelerate (toList)
37 import Math.KMeans (kmeans, euclidSq, elements)
38
39
40 import Gargantext.Prelude
41
42 import Gargantext.Text.Metrics.Count (occurrences, cooc)
43 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
44 import Gargantext.Core (Lang(EN))
45 import Gargantext.Core.Types (Terms(..))
46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
47
48 import Gargantext.Viz.Graph.Distances.Matrice
49 import Gargantext.Viz.Graph.Index
50
51 import qualified Data.Array.Accelerate.Interpreter as DAA
52 import qualified Data.Array.Accelerate as DAA
53
54 import GHC.Real (round)
55
56 --filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
57 --filterCooc m =
58 ---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
59 ----(ti, fi) = createIndices m
60 -- . fromIndex fi $ filterMat $ cooc2mat ti m
61
62
63 type MapListSize = Int
64 type SampleBins = Double
65 type Clusters = Int
66
67 -- | Map list creation
68 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
69 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
70 -- each parts is then ordered by Inclusion/Exclusion
71 -- take n scored terms in each parts where n * SampleBins = MapListSize.
72 takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
73 takeSome l s k scores = L.take l
74 $ takeSample n m
75 $ splitKmeans k scores
76 where
77 -- TODO: benchmark with accelerate-example kmeans version
78 splitKmeans x xs = elements
79 $ V.head
80 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
81 euclidSq x xs
82 n = round ((fromIntegral l)/s)
83 m = round $ (fromIntegral $ length scores) / (s)
84 takeSample n m xs = L.concat $ map (L.take n)
85 $ L.reverse $ map (L.sortOn _scored_incExc)
86 $ splitEvery m
87 $ L.reverse $ L.sortOn _scored_speGen xs
88
89
90 data Scored t = Scored { _scored_terms :: t
91 , _scored_incExc :: InclusionExclusion
92 , _scored_speGen :: SpecificityGenericity
93 } deriving (Show)
94
95 incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
96 incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
97 where
98 (ti,fi) = createIndices m
99 (is, ss) = incExcSpeGen $ cooc2mat ti m
100 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
101
102
103 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
104 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
105 where
106 (ti,fi) = createIndices m
107 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
108
109
110
111
112 metrics_text :: Text
113 metrics_text = T.intercalate " " metrics_sentences
114
115 metrics_sentences' :: [Text]
116 metrics_sentences' = splitBy (Sentences 0) metrics_text
117
118 -- | Sentences
119 metrics_sentences :: [Text]
120 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
121 , "I can see the glass on the table."
122 , "There was only a spoon on that table."
123 , "The glass just fall from the table, pouring wine everywhere."
124 , "I wish the glass did not contain wine."
125 ]
126
127 metrics_sentences_Test = metrics_sentences == metrics_sentences'
128
129 -- | Terms reordered to visually check occurrences
130 -- >>>
131 {- [ [["table"],["glass"],["wine"],["spoon"]]
132 , [["glass"],["table"]]
133 , [["spoon"],["table"]]
134 , [["glass"],["table"],["wine"]]
135 , [["glass"],["wine"]]
136 ]
137 -}
138
139 metrics_terms :: IO [[Terms]]
140 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
141
142 -- | Occurrences
143 {-
144 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
145 , (fromList ["object"],fromList [(["object"], 3 )])
146 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
147 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
148 -}
149 metrics_occ = occurrences <$> L.concat <$> metrics_terms
150
151 {-
152 -- fromList [((["glas"],["object"]),6)
153 ,((["glas"],["spoon"]),4)
154 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
155
156 -}
157 metrics_cooc = cooc <$> metrics_terms
158
159 metrics_cooc_mat = do
160 m <- metrics_cooc
161 let (ti,_) = createIndices m
162 let mat_cooc = cooc2mat ti m
163 pure ( ti
164 , mat_cooc
165 , incExcSpeGen_proba mat_cooc
166 , incExcSpeGen mat_cooc
167 )
168
169 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
170