]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/Aggregates.hs
working on a new distance in phylo
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / Aggregates.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE ViewPatterns #-}
12
13 module Gargantext.Core.Viz.Phylo.Aggregates
14 where
15
16 import Control.Parallel.Strategies
17
18 import Gargantext.Prelude hiding (elem)
19 import Gargantext.Core.Text.Context (TermList)
20 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
21 import Gargantext.Core.Text.Terms.Mono (monoTexts)
22 import Gargantext.Core.Viz.Phylo
23 import Gargantext.Core.Viz.Phylo.Tools
24
25 import Debug.Trace (trace)
26
27 import Data.List (partition, concat, nub, elem, sort, (++), null, union)
28 import Data.Map (Map, fromList, fromListWith, adjust, filterWithKey, elems, keys, unionWith, mapWithKey)
29 import Data.Set (size)
30 import Data.Text (Text, unwords)
31 import Data.Vector (Vector)
32
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35
36 import qualified Data.Vector as Vector
37
38
39
40 ---------------------
41 -- | Foundations | --
42 ---------------------
43
44 -- | Extract all the labels of a termList
45 termListToNgrams :: TermList -> [Ngrams]
46 termListToNgrams = map (\(lbl,_) -> unwords lbl)
47
48
49 -------------------
50 -- | Documents | --
51 -------------------
52
53 -- | To group a list of Documents by fixed periods
54 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
55 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
56 groupDocsByPeriod f pds es =
57 let periods = map (inPeriode f es) pds
58 periods' = periods `using` parList rdeepseq
59
60 in trace ("----\nGroup docs by periods\n") $ fromList $ zip pds periods'
61 where
62 --------------------------------------
63 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
64 inPeriode f' h (start,end) =
65 fst $ partition (\d -> f' d >= start && f' d <= end) h
66 --------------------------------------
67
68
69 -- | To parse a list of Documents by filtering on a Vector of Ngrams
70 parseDocs :: Vector Ngrams -> [(Date,Text)] -> [Document]
71 parseDocs roots c = map (\(d,t)
72 -> Document d ( filter (\x -> Vector.elem x roots)
73 $ monoTexts t)) c
74
75 -- | To count the number of documents by year
76 countDocs :: [(Date,a)] -> Map Date Double
77 countDocs corpus = fromListWith (+) $ map (\(d,_) -> (d,1)) corpus
78
79
80 -----------------
81 -- | Periods | --
82 -----------------
83
84
85 -- | To init a list of Periods framed by a starting Date and an ending Date
86 initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)]
87 initPeriods g s (start,end) = map (\l -> (head' "initPeriods" l, last' "initPeriods" l))
88 $ chunkAlong g s [start .. end]
89
90
91 --------------
92 -- | Cooc | --
93 --------------
94
95
96 -- | To transform a tuple of group's information into a coocurency Matrix
97 toCooc :: [([Int],Double)] -> Map (Int, Int) Double
98 toCooc l = map (/docs)
99 $ foldl (\mem x -> adjust (+1) x mem) cooc
100 $ concat
101 $ map (\x -> listToFullCombi $ fst x) l
102 where
103 --------------------------------------
104 idx :: [Int]
105 idx = nub $ concat $ map fst l
106 --------------------------------------
107 docs :: Double
108 docs = sum $ map snd l
109 --------------------------------------
110 cooc :: Map (Int, Int) (Double)
111 cooc = fromList $ map (\x -> (x,0)) $ listToFullCombi idx
112 --------------------------------------
113
114
115 -- | To reduce a coocurency Matrix to some keys
116 getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
117 getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
118 && (elem (snd k) idx)) cooc
119
120
121 -- | To get a coocurency Matrix related to a given list of Periods
122 getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
123 getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
124 where
125 --------------------------------------
126 -- | Here we need to go back to the level 1 (aka : the Fis level)
127 gs :: [PhyloGroup]
128 gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
129 --------------------------------------
130
131
132 -- | To transform a list of index into a cooc matrix
133 listToCooc :: [Int] -> Map (Int,Int) Double
134 listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
135
136
137 -- | To build the cooc matrix by years out of the corpus
138 docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
139 docsToCooc docs fdt = fromListWith sumCooc
140 $ map (\(d,l) -> (d, listToCooc l))
141 $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
142
143
144 -------------
145 -- | Fis | --
146 -------------
147
148
149 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
150 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
151 filterFis keep thr f m = case keep of
152 False -> map (\l -> f thr l) m
153 True -> map (\l -> keepFilled (f) thr l) m
154
155
156 -- | To filter Fis with small Support
157 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
158 filterFisBySupport thr l = filter (\fis -> getSupport fis >= thr) l
159
160
161 -- | To filter Fis with small Clique size
162 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
163 filterFisByClique thr l = filter (\fis -> (size $ getClique fis) >= thr) l
164
165
166 -- | To find if l' is nested in l
167 isNested :: Eq a => [a] -> [a] -> Bool
168 isNested l l'
169 | null l' = True
170 | length l' > length l = False
171 | (union l l') == l = True
172 | otherwise = False
173
174
175 -- | To filter nested Fis
176 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
177 filterFisByNested m =
178 let fis = map (\l ->
179 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ getClique f') (Set.toList $ getClique f)) mem)
180 then mem
181 else
182 let fMax = filter (\f' -> not $ isNested (Set.toList $ getClique f) (Set.toList $ getClique f')) mem
183 in fMax ++ [f] ) [] l)
184 $ elems m
185 fis' = fis `using` parList rdeepseq
186 in fromList $ zip (keys m) fis'
187
188
189 -- | Choose if we use a set of Fis from a file or if we have to create them
190 docsToFis :: Map (Date,Date) [Document] -> Phylo -> Map (Date, Date) [PhyloFis]
191 docsToFis m p = if (null $ getPhyloFis p)
192 then trace("----\nRebuild the Fis from scratch\n")
193 $ mapWithKey (\k docs -> let fis = Map.toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
194 in map (\f -> PhyloFis (fst f) (snd f) k) fis) m
195 else trace("----\nUse Fis from an existing file\n")
196 $ unionWith (++) (fromList $ map (\k -> (k,[])) $ keys m) (getPhyloFis p)
197
198
199 -- | Process some filters on top of a set of Fis
200 refineFis :: Map (Date, Date) [PhyloFis] -> Bool -> Support -> Int -> Map (Date, Date) [PhyloFis]
201 refineFis fis k s t = traceFis "----\nFiltered Fis by nested :\n"
202 $ filterFisByNested
203 $ traceFis "----\nFiltered Fis by clique size :\n"
204 $ filterFis k t (filterFisByClique)
205 $ traceFis "----\nFiltered Fis by support :\n"
206 $ filterFis k s (filterFisBySupport)
207 $ traceFis "----\nUnfiltered Fis :\n" fis
208
209
210 -----------------
211 -- | Tracers | --
212 -----------------
213
214
215 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
216 traceFis lbl m = trace (lbl <> "count : " <> show (sum $ map length $ elems m) <> " Fis\n"
217 <> "support : " <> show (countSup 1 supps) <> " (>1) "
218 <> show (countSup 2 supps) <> " (>2) "
219 <> show (countSup 3 supps) <> " (>3) "
220 <> show (countSup 4 supps) <> " (>4) "
221 <> show (countSup 5 supps) <> " (>5) "
222 <> show (countSup 6 supps) <> " (>6)\n"
223 <> "clique size : " <> show (countSup 1 ngrms) <> " (>1) "
224 <> show (countSup 2 ngrms) <> " (>2) "
225 <> show (countSup 3 ngrms) <> " (>3) "
226 <> show (countSup 4 ngrms) <> " (>4) "
227 <> show (countSup 5 ngrms) <> " (>5) "
228 <> show (countSup 6 ngrms) <> " (>6)\n"
229 ) m
230 where
231 --------------------------------------
232 countSup :: Double -> [Double] -> Int
233 countSup s l = length $ filter (>s) l
234 --------------------------------------
235 supps :: [Double]
236 supps = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems m
237 --------------------------------------
238 ngrms :: [Double]
239 ngrms = sort $ map (\f -> fromIntegral $ size $ _phyloFis_clique f) $ concat $ elems m
240 --------------------------------------