]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/Aggregates.hs
merge done
[gargantext.git] / src / Gargantext / Viz / Phylo / Aggregates.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.Aggregates
18 where
19
20
21 import Control.Lens hiding (makeLenses, both, Level)
22
23 import Gargantext.Prelude hiding (elem)
24 import Gargantext.Text.Context (TermList)
25 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
26 import Gargantext.Text.Terms.Mono (monoTexts)
27 import Gargantext.Viz.Phylo
28 import Gargantext.Viz.Phylo.Tools
29
30 import Debug.Trace (trace)
31
32 import Data.List (partition, concat, nub, elem, sort, (++), null)
33 import Data.Map (Map, fromList, fromListWith, adjust, filterWithKey, toList, elems, keys, unionWith, mapWithKey)
34 import Data.Set (size)
35 import Data.Text (Text, unwords)
36 import Data.Vector (Vector)
37
38 import qualified Data.Vector as Vector
39
40
41
42 ---------------------
43 -- | Foundations | --
44 ---------------------
45
46 -- | Extract all the labels of a termList
47 termListToNgrams :: TermList -> [Ngrams]
48 termListToNgrams = map (\(lbl,_) -> unwords lbl)
49
50
51 -------------------
52 -- | Documents | --
53 -------------------
54
55 -- | To group a list of Documents by fixed periods
56 groupDocsByPeriod :: (Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
57 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
58 groupDocsByPeriod f pds es = trace ("----\nGroup docs by periods\n") $ fromList $ zip pds $ map (inPeriode f es) pds
59 where
60 --------------------------------------
61 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
62 inPeriode f' h (start,end) =
63 fst $ partition (\d -> f' d >= start && f' d <= end) h
64 --------------------------------------
65
66
67 -- | To parse a list of Documents by filtering on a Vector of Ngrams
68 parseDocs :: Vector Ngrams -> [(Date,Text)] -> [Document]
69 parseDocs roots c = map (\(d,t)
70 -> Document d ( filter (\x -> Vector.elem x roots)
71 $ monoTexts t)) c
72
73 -- | To count the number of documents by year
74 countDocs :: [(Date,a)] -> Map Date Double
75 countDocs corpus = fromListWith (+) $ map (\(d,_) -> (d,1)) corpus
76
77
78 -----------------
79 -- | Periods | --
80 -----------------
81
82
83 -- | To init a list of Periods framed by a starting Date and an ending Date
84 initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)]
85 initPeriods g s (start,end) = map (\l -> (head' "initPeriods" l, last' "initPeriods" l))
86 $ chunkAlong g s [start .. end]
87
88
89 --------------
90 -- | Cooc | --
91 --------------
92
93
94 -- | To transform a tuple of group's information into a coocurency Matrix
95 toCooc :: [([Int],Double)] -> Map (Int, Int) Double
96 toCooc l = map (/docs)
97 $ foldl (\mem x -> adjust (+1) x mem) cooc
98 $ concat
99 $ map (\x -> listToFullCombi $ fst x) l
100 where
101 --------------------------------------
102 idx :: [Int]
103 idx = nub $ concat $ map fst l
104 --------------------------------------
105 docs :: Double
106 docs = sum $ map snd l
107 --------------------------------------
108 cooc :: Map (Int, Int) (Double)
109 cooc = fromList $ map (\x -> (x,0)) $ listToFullCombi idx
110 --------------------------------------
111
112
113 -- | To reduce a coocurency Matrix to some keys
114 getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
115 getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
116 && (elem (snd k) idx)) cooc
117
118
119 -- | To get a coocurency Matrix related to a given list of Periods
120 getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
121 getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
122 where
123 --------------------------------------
124 -- | Here we need to go back to the level 1 (aka : the Fis level)
125 gs :: [PhyloGroup]
126 gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
127 --------------------------------------
128
129
130 -- | To transform a list of index into a cooc matrix
131 listToCooc :: [Int] -> Map (Int,Int) Double
132 listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
133
134
135 -- | To build the cooc matrix by years out of the corpus
136 docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
137 docsToCooc docs fdt = fromListWith sumCooc
138 $ map (\(d,l) -> (d, listToCooc l))
139 $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
140
141
142 -------------
143 -- | Fis | --
144 -------------
145
146
147 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
148 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
149 filterFis keep thr f m = case keep of
150 False -> map (\l -> f thr l) m
151 True -> map (\l -> keepFilled (f) thr l) m
152
153
154 -- | To filter Fis with small Support
155 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
156 filterFisBySupport thr l = filter (\fis -> getSupport fis >= thr) l
157
158
159 -- | To filter Fis with small Clique size
160 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
161 filterFisByClique thr l = filter (\fis -> (size $ getClique fis) >= thr) l
162
163
164 -- | To filter nested Fis
165 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
166 filterFisByNested = map (\l -> let cliqueMax = filterNestedSets (head' "Fis" $ map getClique l) (map getClique l) []
167 in filter (\fis -> elem (getClique fis) cliqueMax) l)
168
169
170 -- | Choose if we use a set of Fis from a file or if we have to create them
171 docsToFis :: Map (Date,Date) [Document] -> Phylo -> Phylo
172 docsToFis m p = if (null $ getPhyloFis p)
173 then trace("----\nRebuild the Fis from scratch\n")
174 $ p & phylo_fis .~ mapWithKey (\k docs -> let fis = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
175 in map (\f -> PhyloFis (fst f) (snd f) k) fis) m
176 else trace("----\nUse Fis from an existing file\n")
177 $ p & phylo_fis %~ (unionWith (++) (fromList $ map (\k -> (k,[])) $ keys m))
178
179
180 -- | Process some filters on top of a set of Fis
181 refineFis :: Map (Date, Date) [PhyloFis] -> Bool -> Support -> Int -> Map (Date, Date) [PhyloFis]
182 refineFis fis k s t = traceFis "----\nFiltered Fis by clique size :\n"
183 $ filterFis k t (filterFisByClique)
184 $ traceFis "----\nFiltered Fis by nested :\n"
185 $ filterFisByNested
186 $ traceFis "----\nFiltered Fis by support :\n"
187 $ filterFis k s (filterFisBySupport)
188 $ traceFis "----\nUnfiltered Fis :\n" fis
189
190
191 -----------------
192 -- | Tracers | --
193 -----------------
194
195
196 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
197 traceFis lbl m = trace (lbl <> "count : " <> show (sum $ map length $ elems m) <> " Fis\n"
198 <> "support : " <> show (countSup 1 supps) <> " (>1) "
199 <> show (countSup 2 supps) <> " (>2) "
200 <> show (countSup 3 supps) <> " (>3) "
201 <> show (countSup 4 supps) <> " (>4) "
202 <> show (countSup 5 supps) <> " (>5) "
203 <> show (countSup 6 supps) <> " (>6)\n"
204 <> "clique size : " <> show (countSup 1 ngrms) <> " (>1) "
205 <> show (countSup 2 ngrms) <> " (>2) "
206 <> show (countSup 3 ngrms) <> " (>3) "
207 <> show (countSup 4 ngrms) <> " (>4) "
208 <> show (countSup 5 ngrms) <> " (>5) "
209 <> show (countSup 6 ngrms) <> " (>6)\n"
210 ) m
211 where
212 --------------------------------------
213 countSup :: Double -> [Double] -> Int
214 countSup s l = length $ filter (>s) l
215 --------------------------------------
216 supps :: [Double]
217 supps = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems m
218 --------------------------------------
219 ngrms :: [Double]
220 ngrms = sort $ map (\f -> fromIntegral $ size $ _phyloFis_clique f) $ concat $ elems m
221 --------------------------------------