{-|
Module      : Gargantext.Text.Metrics
Description : Metrics for text mining (terms selection and scoring).
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Mainly reexport functions in @Data.Text.Metrics@


TODO
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m

-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Text.Metrics
  where

import Data.Text (Text, pack)
import Data.Map (Map)

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import Data.Tuple.Extra (both)
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
import Data.Array.Accelerate (toList)


import Gargantext.Prelude

import Gargantext.Text.Metrics.Count (occurrences, cooc)
import Gargantext.Text.Terms (TermType(MonoMulti), terms)
import Gargantext.Core (Lang(EN))
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index

import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA

import GHC.Real (round)

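-- A possible form of the noApax TODO in the module header above (kept as a
-- comment: the Occ count type is assumed to live in
-- Gargantext.Text.Metrics.Count or Gargantext.Prelude and is not imported
-- here):
--
-- noApax :: Ord a => Map a Occ -> Map a Occ
-- noApax = M.filter (>1)
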
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m

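-- A hypothetical completion of the commented-out filterCooc above, assembled
-- from its own fragments; it assumes a filterMat function over the
-- co-occurrence matrix exists (none is imported here), so it stays commented
-- out:
--
-- filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
-- filterCooc m = fromIndex fi . filterMat $ cooc2mat ti m
--   where
--     (ti, fi) = createIndices m
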

type ListSize = Int
type BinSize = Double

-- | Select up to @l@ terms from a scored list: terms are first ranked by
-- inclusion/exclusion, then sampled across bins of specificity/genericity
-- so that the selection covers several regions of that score.
takeSome :: Ord t => ListSize -> BinSize -> [Scored t] -> [Scored t]
takeSome l s scores = L.take l
                    $ takeSample n m
                    $ takeKmeans l'
                    $ L.reverse $ L.sortOn _scored_incExc scores
  where
    -- TODO: KMEAN split into 2 main clusters
    -- (advice: use the accelerate-examples kmeans version
    -- and maybe benchmark it to be sure)
    takeKmeans = L.take
    l' = 4000
    -- items kept per bin, and bin size, both derived from the BinSize s
    n = round (fromIntegral l / s)
    m = round (fromIntegral (length scores) / s)
    -- rank by specificity/genericity, split into bins of size m', then keep
    -- n' terms (ordered by inclusion/exclusion) from each bin
    takeSample n' m' xs = L.concat $ map (L.take n')
                        $ L.reverse $ map (L.sortOn _scored_incExc)
                        $ splitEvery m'
                        $ L.reverse $ L.sortOn _scored_speGen xs

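-- A usage sketch for 'takeSome' (coocMap and selected are hypothetical names,
-- not defined in this module): keep at most 100 terms, with bins of size 10,
-- out of the scored co-occurrence map:
--
-- > selected = takeSome 100 10 (incExcSpeGen_sorted' coocMap)
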

data Scored t = Scored { _scored_terms  :: t
                       , _scored_incExc :: InclusionExclusion
                       , _scored_speGen :: SpecificityGenericity
                       } deriving (Show)

-- | Score each term of a co-occurrence map by inclusion/exclusion and
-- specificity/genericity.
incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
  where
    (ti,fi)  = createIndices m
    (is, ss) = incExcSpeGen $ cooc2mat ti m
    scores   = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)

-- | Terms ranked (descending) by inclusion/exclusion and by
-- specificity/genericity, as two separate lists.
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
  where
    (ti,fi) = createIndices m
    ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)


metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences

metrics_sentences' :: [Text]
metrics_sentences' = splitBy (Sentences 0) metrics_text

-- | Sentences
metrics_sentences :: [Text]
metrics_sentences = [ "There is a table with a glass of wine and a spoon."
                    , "I can see the glass on the table."
                    , "There was only a spoon on that table."
                    , "The glass just fall from the table, pouring wine everywhere."
                    , "I wish the glass did not contain wine."
                    ]

metrics_sentences_Test :: Bool
metrics_sentences_Test = metrics_sentences == metrics_sentences'

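-- Quick check sketch (GHCi): the intended result is True, i.e. splitting
-- metrics_text with splitBy (Sentences 0) gives back metrics_sentences:
--
-- > metrics_sentences_Test
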
-- | Terms reordered to visually check occurrences
-- >>>
{- [ [["table"],["glass"],["wine"],["spoon"]]
   , [["glass"],["table"]]
   , [["spoon"],["table"]]
   , [["glass"],["table"],["wine"]]
   , [["glass"],["wine"]]
   ]
-}

metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text

-- | Occurrences
{-
fromList [ (fromList ["table"] , fromList [(["table"] , 3)])
         , (fromList ["object"], fromList [(["object"], 3)])
         , (fromList ["glas"]  , fromList [(["glas"]  , 2)])
         , (fromList ["spoon"] , fromList [(["spoon"] , 2)])
         ]
-}
metrics_occ = occurrences <$> L.concat <$> metrics_terms

{-
fromList [ ((["glas"]  ,["object"]), 6)
         , ((["glas"]  ,["spoon"]) , 4)
         , ((["glas"]  ,["table"]) , 6)
         , ((["object"],["spoon"]) , 6)
         , ((["object"],["table"]) , 9)
         , ((["spoon"] ,["table"]) , 6)
         ]
-}
metrics_cooc = cooc <$> metrics_terms

metrics_cooc_mat = do
  m <- metrics_cooc
  let (ti,_) = createIndices m
  let mat_cooc = cooc2mat ti m
  pure ( ti
       , mat_cooc
       , incExcSpeGen_proba mat_cooc
       , incExcSpeGen mat_cooc
       )

metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc

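-- A possible GHCi session over the example pipeline above (results omitted
-- here, they depend on the terms extraction):
--
-- > (ti, mat, proba, incSpe) <- metrics_cooc_mat
-- > (byInc, bySpe)           <- metrics_incExcSpeGen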