]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics.hs
[CLEAN] code kmeans.
[gargantext.git] / src / Gargantext / Text / Metrics.hs
1 {-|
2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly reexport functions in @Data.Text.Metrics@
11
12
13 TODO
14 noApax :: Ord a => Map a Occ -> Map a Occ
15 noApax m = M.filter (>1) m
16
17 -}
18
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21
22 module Gargantext.Text.Metrics
23 where
24
25 import Data.Text (Text, pack)
26 import Data.Map (Map)
27 import qualified Data.List as L
28 import qualified Data.Map as M
29 import qualified Data.Set as S
30 import qualified Data.Text as T
31 import qualified Data.Vector as V
32 import qualified Data.Vector.Unboxed as VU
33 import Data.Tuple.Extra (both)
34 --import GHC.Real (Ratio)
35 --import qualified Data.Text.Metrics as DTM
36 import Data.Array.Accelerate (toList)
37 import Math.KMeans (kmeans, euclidSq, elements)
38
39
40 import Gargantext.Prelude
41
42 import Gargantext.Text.Metrics.Count (occurrences, cooc)
43 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
44 import Gargantext.Core (Lang(EN))
45 import Gargantext.Core.Types (Terms(..))
46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
47
48 import Gargantext.Viz.Graph.Distances.Matrice
49 import Gargantext.Viz.Graph.Index
50
51 import qualified Data.Array.Accelerate.Interpreter as DAA
52 import qualified Data.Array.Accelerate as DAA
53
54 import GHC.Real (round)
55
56 --filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
57 --filterCooc m =
58 ---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
59 ----(ti, fi) = createIndices m
60 -- . fromIndex fi $ filterMat $ cooc2mat ti m
61
62
63 type ListSize = Int
64 type BinSize = Double
65
66 takeSome :: Ord t => ListSize -> BinSize -> [Scored t] -> [Scored t]
67 takeSome l s scores = L.take l
68 $ takeSample n m
69 $ splitKmeans 2 scores
70 where
71 -- TODO : KMEAN split into 2 main clusters
72 -- (TODO: benchmark with accelerate-example kmeans version)
73 splitKmeans x xs = elements
74 $ V.head
75 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
76 euclidSq x xs
77 n = round ((fromIntegral l)/s)
78 m = round $ (fromIntegral $ length scores) / (s)
79 takeSample n m xs = L.concat $ map (L.take n)
80 $ L.reverse $ map (L.sortOn _scored_incExc)
81 $ splitEvery m
82 $ L.reverse $ L.sortOn _scored_speGen xs
83
84
85 data Scored t = Scored { _scored_terms :: t
86 , _scored_incExc :: InclusionExclusion
87 , _scored_speGen :: SpecificityGenericity
88 } deriving (Show)
89
90 incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
91 incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
92 where
93 (ti,fi) = createIndices m
94 (is, ss) = incExcSpeGen $ cooc2mat ti m
95 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
96
97
98 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
99 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
100 where
101 (ti,fi) = createIndices m
102 ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
103
104
105
106
107 metrics_text :: Text
108 metrics_text = T.intercalate " " metrics_sentences
109
110 metrics_sentences' :: [Text]
111 metrics_sentences' = splitBy (Sentences 0) metrics_text
112
113 -- | Sentences
114 metrics_sentences :: [Text]
115 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
116 , "I can see the glass on the table."
117 , "There was only a spoon on that table."
118 , "The glass just fall from the table, pouring wine everywhere."
119 , "I wish the glass did not contain wine."
120 ]
121
122 metrics_sentences_Test = metrics_sentences == metrics_sentences'
123
124 -- | Terms reordered to visually check occurrences
125 -- >>>
126 {- [ [["table"],["glass"],["wine"],["spoon"]]
127 , [["glass"],["table"]]
128 , [["spoon"],["table"]]
129 , [["glass"],["table"],["wine"]]
130 , [["glass"],["wine"]]
131 ]
132 -}
133
134 metrics_terms :: IO [[Terms]]
135 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
136
137 -- | Occurrences
138 {-
139 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
140 , (fromList ["object"],fromList [(["object"], 3 )])
141 , (fromList ["glas"] ,fromList [(["glas"] , 2 )])
142 , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
143 -}
144 metrics_occ = occurrences <$> L.concat <$> metrics_terms
145
146 {-
147 -- fromList [((["glas"],["object"]),6)
148 ,((["glas"],["spoon"]),4)
149 ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
150
151 -}
152 metrics_cooc = cooc <$> metrics_terms
153
154 metrics_cooc_mat = do
155 m <- metrics_cooc
156 let (ti,_) = createIndices m
157 let mat_cooc = cooc2mat ti m
158 pure ( ti
159 , mat_cooc
160 , incExcSpeGen_proba mat_cooc
161 , incExcSpeGen mat_cooc
162 )
163
164 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
165