]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/Aggregates.hs
[PHYLO] merge
[gargantext.git] / src / Gargantext / Viz / Phylo / Aggregates.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.Aggregates
18 where
19
20 import Control.Parallel.Strategies
21
22 import Gargantext.Prelude hiding (elem)
23 import Gargantext.Text.Context (TermList)
24 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Text.Terms.Mono (monoTexts)
26 import Gargantext.Viz.Phylo
27 import Gargantext.Viz.Phylo.Tools
28
29 import Debug.Trace (trace)
30
31 import Data.List (partition, concat, nub, elem, sort, (++), null, union)
32 import Data.Map (Map, fromList, fromListWith, adjust, filterWithKey, elems, keys, unionWith, mapWithKey)
33 import Data.Set (size)
34 import Data.Text (Text, unwords)
35 import Data.Vector (Vector)
36
37 import qualified Data.Map as Map
38 import qualified Data.Set as Set
39
40 import qualified Data.Vector as Vector
41
42
43
44 ---------------------
45 -- | Foundations | --
46 ---------------------
47
48 -- | Extract all the labels of a termList
49 termListToNgrams :: TermList -> [Ngrams]
50 termListToNgrams = map (\(lbl,_) -> unwords lbl)
51
52
53 -------------------
54 -- | Documents | --
55 -------------------
56
57 -- | To group a list of Documents by fixed periods
58 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
59 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
60 groupDocsByPeriod f pds es =
61 let periods = map (inPeriode f es) pds
62 periods' = periods `using` parList rdeepseq
63
64 in trace ("----\nGroup docs by periods\n") $ fromList $ zip pds periods'
65 where
66 --------------------------------------
67 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
68 inPeriode f' h (start,end) =
69 fst $ partition (\d -> f' d >= start && f' d <= end) h
70 --------------------------------------
71
72
73 -- | To parse a list of Documents by filtering on a Vector of Ngrams
74 parseDocs :: Vector Ngrams -> [(Date,Text)] -> [Document]
75 parseDocs roots c = map (\(d,t)
76 -> Document d ( filter (\x -> Vector.elem x roots)
77 $ monoTexts t)) c
78
79 -- | To count the number of documents by year
80 countDocs :: [(Date,a)] -> Map Date Double
81 countDocs corpus = fromListWith (+) $ map (\(d,_) -> (d,1)) corpus
82
83
84 -----------------
85 -- | Periods | --
86 -----------------
87
88
89 -- | To init a list of Periods framed by a starting Date and an ending Date
90 initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)]
91 initPeriods g s (start,end) = map (\l -> (head' "initPeriods" l, last' "initPeriods" l))
92 $ chunkAlong g s [start .. end]
93
94
95 --------------
96 -- | Cooc | --
97 --------------
98
99
100 -- | To transform a tuple of group's information into a coocurency Matrix
101 toCooc :: [([Int],Double)] -> Map (Int, Int) Double
102 toCooc l = map (/docs)
103 $ foldl (\mem x -> adjust (+1) x mem) cooc
104 $ concat
105 $ map (\x -> listToFullCombi $ fst x) l
106 where
107 --------------------------------------
108 idx :: [Int]
109 idx = nub $ concat $ map fst l
110 --------------------------------------
111 docs :: Double
112 docs = sum $ map snd l
113 --------------------------------------
114 cooc :: Map (Int, Int) (Double)
115 cooc = fromList $ map (\x -> (x,0)) $ listToFullCombi idx
116 --------------------------------------
117
118
119 -- | To reduce a coocurency Matrix to some keys
120 getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
121 getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
122 && (elem (snd k) idx)) cooc
123
124
125 -- | To get a coocurency Matrix related to a given list of Periods
126 getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
127 getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
128 where
129 --------------------------------------
130 -- | Here we need to go back to the level 1 (aka : the Fis level)
131 gs :: [PhyloGroup]
132 gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
133 --------------------------------------
134
135
136 -- | To transform a list of index into a cooc matrix
137 listToCooc :: [Int] -> Map (Int,Int) Double
138 listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
139
140
141 -- | To build the cooc matrix by years out of the corpus
142 docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
143 docsToCooc docs fdt = fromListWith sumCooc
144 $ map (\(d,l) -> (d, listToCooc l))
145 $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
146
147
148 -------------
149 -- | Fis | --
150 -------------
151
152
153 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
154 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
155 filterFis keep thr f m = case keep of
156 False -> map (\l -> f thr l) m
157 True -> map (\l -> keepFilled (f) thr l) m
158
159
160 -- | To filter Fis with small Support
161 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
162 filterFisBySupport thr l = filter (\fis -> getSupport fis >= thr) l
163
164
165 -- | To filter Fis with small Clique size
166 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
167 filterFisByClique thr l = filter (\fis -> (size $ getClique fis) >= thr) l
168
169
170 -- | To find if l' is nested in l
171 isNested :: Eq a => [a] -> [a] -> Bool
172 isNested l l'
173 | null l' = True
174 | length l' > length l = False
175 | (union l l') == l = True
176 | otherwise = False
177
178
179 -- | To filter nested Fis
180 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
181 filterFisByNested m =
182 let fis = map (\l ->
183 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ getClique f') (Set.toList $ getClique f)) mem)
184 then mem
185 else
186 let fMax = filter (\f' -> not $ isNested (Set.toList $ getClique f) (Set.toList $ getClique f')) mem
187 in fMax ++ [f] ) [] l)
188 $ elems m
189 fis' = fis `using` parList rdeepseq
190 in fromList $ zip (keys m) fis'
191
192
193 -- | Choose if we use a set of Fis from a file or if we have to create them
194 docsToFis :: Map (Date,Date) [Document] -> Phylo -> Map (Date, Date) [PhyloFis]
195 docsToFis m p = if (null $ getPhyloFis p)
196 then trace("----\nRebuild the Fis from scratch\n")
197 $ mapWithKey (\k docs -> let fis = Map.toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
198 in map (\f -> PhyloFis (fst f) (snd f) k) fis) m
199 else trace("----\nUse Fis from an existing file\n")
200 $ unionWith (++) (fromList $ map (\k -> (k,[])) $ keys m) (getPhyloFis p)
201
202
203 -- | Process some filters on top of a set of Fis
204 refineFis :: Map (Date, Date) [PhyloFis] -> Bool -> Support -> Int -> Map (Date, Date) [PhyloFis]
205 refineFis fis k s t = traceFis "----\nFiltered Fis by nested :\n"
206 $ filterFisByNested
207 $ traceFis "----\nFiltered Fis by clique size :\n"
208 $ filterFis k t (filterFisByClique)
209 $ traceFis "----\nFiltered Fis by support :\n"
210 $ filterFis k s (filterFisBySupport)
211 $ traceFis "----\nUnfiltered Fis :\n" fis
212
213
214 -----------------
215 -- | Tracers | --
216 -----------------
217
218
219 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
220 traceFis lbl m = trace (lbl <> "count : " <> show (sum $ map length $ elems m) <> " Fis\n"
221 <> "support : " <> show (countSup 1 supps) <> " (>1) "
222 <> show (countSup 2 supps) <> " (>2) "
223 <> show (countSup 3 supps) <> " (>3) "
224 <> show (countSup 4 supps) <> " (>4) "
225 <> show (countSup 5 supps) <> " (>5) "
226 <> show (countSup 6 supps) <> " (>6)\n"
227 <> "clique size : " <> show (countSup 1 ngrms) <> " (>1) "
228 <> show (countSup 2 ngrms) <> " (>2) "
229 <> show (countSup 3 ngrms) <> " (>3) "
230 <> show (countSup 4 ngrms) <> " (>4) "
231 <> show (countSup 5 ngrms) <> " (>5) "
232 <> show (countSup 6 ngrms) <> " (>6)\n"
233 ) m
234 where
235 --------------------------------------
236 countSup :: Double -> [Double] -> Int
237 countSup s l = length $ filter (>s) l
238 --------------------------------------
239 supps :: [Double]
240 supps = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems m
241 --------------------------------------
242 ngrms :: [Double]
243 ngrms = sort $ map (\f -> fromIntegral $ size $ _phyloFis_clique f) $ concat $ elems m
244 --------------------------------------