]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics.hs
Merge branch 'pipeline'
[gargantext.git] / src / Gargantext / Text / Metrics.hs
1 {-|
2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly reexport functions in @Data.Text.Metrics@
11
12
13 TODO
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
16
17 -}
18
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21
22 module Gargantext.Text.Metrics
23 where
24
25 import Data.Text (Text, pack)
26 import Data.Map (Map)
27 import qualified Data.List as L
28 import qualified Data.Map as M
29 import qualified Data.Set as S
30 import qualified Data.Text as T
31 import qualified Data.Vector as V
32 import qualified Data.Vector.Unboxed as VU
33 import Data.Tuple.Extra (both)
34 --import GHC.Real (Ratio)
35 --import qualified Data.Text.Metrics as DTM
36 import Data.Array.Accelerate (toList)
37 import Math.KMeans (kmeans, euclidSq, elements)
38
39
40 import Gargantext.Prelude
41
42 import Gargantext.Text.Metrics.Count (occurrences, cooc)
43 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
44 import Gargantext.Core (Lang(EN))
45 import Gargantext.Core.Types (Terms(..))
46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
47
48 import Gargantext.Viz.Graph.Distances.Matrice
49 import Gargantext.Viz.Graph.Index
50
51 import qualified Data.Array.Accelerate.Interpreter as DAA
52 import qualified Data.Array.Accelerate as DAA
53
54 import GHC.Real (round)
55
56 --filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
57 --filterCooc m =
58 ---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
59 ----(ti, fi) = createIndices m
60 -- . fromIndex fi $ filterMat $ cooc2mat ti m
61
62
63 type ListSize = Int
64 type BinSize = Double
65
66 -- Map list creation
67 -- Kmean split into 2 main clusters with Inclusion/Exclusion (relevance score)
68 -- Sample the main cluster ordered by specificity/genericity in s parts
69 -- each parts is then ordered by Inclusion/Exclusion
70 -- take n scored terms in each parts where n * s = l
71 takeSome :: Ord t => ListSize -> BinSize -> [Scored t] -> [Scored t]
72 takeSome l s scores = L.take l
73 $ takeSample n m
74 $ splitKmeans 2 scores
75 where
76 -- (TODO: benchmark with accelerate-example kmeans version)
77 splitKmeans x xs = elements
78 $ V.head
79 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
80 euclidSq x xs
81 n = round ((fromIntegral l)/s)
82 m = round $ (fromIntegral $ length scores) / (s)
83 takeSample n m xs = L.concat $ map (L.take n)
84 $ L.reverse $ map (L.sortOn _scored_incExc)
85 $ splitEvery m
86 $ L.reverse $ L.sortOn _scored_speGen xs
87
88
89 data Scored t = Scored { _scored_terms :: t
90 , _scored_incExc :: InclusionExclusion
91 , _scored_speGen :: SpecificityGenericity
92 } deriving (Show)
93
94 incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
95 incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
96 where
97 (ti,fi) = createIndices m
98 (is, ss) = incExcSpeGen $ cooc2mat ti m
99 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
100
101
102 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
103 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
104 where
105 (ti,fi) = createIndices m
106 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
107
108
109
110
111 metrics_text :: Text
112 metrics_text = T.intercalate " " metrics_sentences
113
114 metrics_sentences' :: [Text]
115 metrics_sentences' = splitBy (Sentences 0) metrics_text
116
117 -- | Sentences
118 metrics_sentences :: [Text]
119 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
120 , "I can see the glass on the table."
121 , "There was only a spoon on that table."
122 , "The glass just fall from the table, pouring wine everywhere."
123 , "I wish the glass did not contain wine."
124 ]
125
126 metrics_sentences_Test = metrics_sentences == metrics_sentences'
127
128 -- | Terms reordered to visually check occurrences
129 -- >>>
130 {- [ [["table"],["glass"],["wine"],["spoon"]]
131 , [["glass"],["table"]]
132 , [["spoon"],["table"]]
133 , [["glass"],["table"],["wine"]]
134 , [["glass"],["wine"]]
135 ]
136 -}
137
138 metrics_terms :: IO [[Terms]]
139 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
140
141 -- | Occurrences
142 {-
143 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
144 , (fromList ["object"],fromList [(["object"], 3 )])
145 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
146 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
147 -}
148 metrics_occ = occurrences <$> L.concat <$> metrics_terms
149
150 {-
151 -- fromList [((["glas"],["object"]),6)
152 ,((["glas"],["spoon"]),4)
153 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
154
155 -}
156 metrics_cooc = cooc <$> metrics_terms
157
158 metrics_cooc_mat = do
159 m <- metrics_cooc
160 let (ti,_) = createIndices m
161 let mat_cooc = cooc2mat ti m
162 pure ( ti
163 , mat_cooc
164 , incExcSpeGen_proba mat_cooc
165 , incExcSpeGen mat_cooc
166 )
167
168 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
169