2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Viz.Phylo.Aggregates
16 import Control.Parallel.Strategies
18 import Gargantext.Prelude hiding (elem)
19 import Gargantext.Text.Context (TermList)
20 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
21 import Gargantext.Text.Terms.Mono (monoTexts)
22 import Gargantext.Viz.Phylo
23 import Gargantext.Viz.Phylo.Tools
25 import Debug.Trace (trace)
27 import Data.List (partition, concat, nub, elem, sort, (++), null, union)
28 import Data.Map (Map, fromList, fromListWith, adjust, filterWithKey, elems, keys, unionWith, mapWithKey)
29 import Data.Set (size)
30 import Data.Text (Text, unwords)
31 import Data.Vector (Vector)
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
36 import qualified Data.Vector as Vector
44 -- | Extract all the labels of a termList
45 termListToNgrams :: TermList -> [Ngrams]
46 termListToNgrams = map (\(lbl,_) -> unwords lbl)
53 -- | To group a list of Documents by fixed periods
54 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
55 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
56 groupDocsByPeriod f pds es =
57 let periods = map (inPeriode f es) pds
58 periods' = periods `using` parList rdeepseq
60 in trace ("----\nGroup docs by periods\n") $ fromList $ zip pds periods'
62 --------------------------------------
63 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
64 inPeriode f' h (start,end) =
65 fst $ partition (\d -> f' d >= start && f' d <= end) h
66 --------------------------------------
69 -- | To parse a list of Documents by filtering on a Vector of Ngrams
70 parseDocs :: Vector Ngrams -> [(Date,Text)] -> [Document]
71 parseDocs roots c = map (\(d,t)
72 -> Document d ( filter (\x -> Vector.elem x roots)
75 -- | To count the number of documents by year
76 countDocs :: [(Date,a)] -> Map Date Double
77 countDocs corpus = fromListWith (+) $ map (\(d,_) -> (d,1)) corpus
85 -- | To init a list of Periods framed by a starting Date and an ending Date
86 initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)]
87 initPeriods g s (start,end) = map (\l -> (head' "initPeriods" l, last' "initPeriods" l))
88 $ chunkAlong g s [start .. end]
96 -- | To transform a tuple of group's information into a coocurency Matrix
97 toCooc :: [([Int],Double)] -> Map (Int, Int) Double
98 toCooc l = map (/docs)
99 $ foldl (\mem x -> adjust (+1) x mem) cooc
101 $ map (\x -> listToFullCombi $ fst x) l
103 --------------------------------------
105 idx = nub $ concat $ map fst l
106 --------------------------------------
108 docs = sum $ map snd l
109 --------------------------------------
110 cooc :: Map (Int, Int) (Double)
111 cooc = fromList $ map (\x -> (x,0)) $ listToFullCombi idx
112 --------------------------------------
115 -- | To reduce a coocurency Matrix to some keys
116 getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
117 getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
118 && (elem (snd k) idx)) cooc
121 -- | To get a coocurency Matrix related to a given list of Periods
122 getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
123 getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
125 --------------------------------------
126 -- | Here we need to go back to the level 1 (aka : the Fis level)
128 gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
129 --------------------------------------
132 -- | To transform a list of index into a cooc matrix
133 listToCooc :: [Int] -> Map (Int,Int) Double
134 listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
137 -- | To build the cooc matrix by years out of the corpus
138 docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
139 docsToCooc docs fdt = fromListWith sumCooc
140 $ map (\(d,l) -> (d, listToCooc l))
141 $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
149 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
150 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
151 filterFis keep thr f m = case keep of
152 False -> map (\l -> f thr l) m
153 True -> map (\l -> keepFilled (f) thr l) m
156 -- | To filter Fis with small Support
157 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
158 filterFisBySupport thr l = filter (\fis -> getSupport fis >= thr) l
161 -- | To filter Fis with small Clique size
162 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
163 filterFisByClique thr l = filter (\fis -> (size $ getClique fis) >= thr) l
166 -- | To find if l' is nested in l
167 isNested :: Eq a => [a] -> [a] -> Bool
170 | length l' > length l = False
171 | (union l l') == l = True
175 -- | To filter nested Fis
176 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
177 filterFisByNested m =
179 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ getClique f') (Set.toList $ getClique f)) mem)
182 let fMax = filter (\f' -> not $ isNested (Set.toList $ getClique f) (Set.toList $ getClique f')) mem
183 in fMax ++ [f] ) [] l)
185 fis' = fis `using` parList rdeepseq
186 in fromList $ zip (keys m) fis'
189 -- | Choose if we use a set of Fis from a file or if we have to create them
190 docsToFis :: Map (Date,Date) [Document] -> Phylo -> Map (Date, Date) [PhyloFis]
191 docsToFis m p = if (null $ getPhyloFis p)
192 then trace("----\nRebuild the Fis from scratch\n")
193 $ mapWithKey (\k docs -> let fis = Map.toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
194 in map (\f -> PhyloFis (fst f) (snd f) k) fis) m
195 else trace("----\nUse Fis from an existing file\n")
196 $ unionWith (++) (fromList $ map (\k -> (k,[])) $ keys m) (getPhyloFis p)
199 -- | Process some filters on top of a set of Fis
200 refineFis :: Map (Date, Date) [PhyloFis] -> Bool -> Support -> Int -> Map (Date, Date) [PhyloFis]
201 refineFis fis k s t = traceFis "----\nFiltered Fis by nested :\n"
203 $ traceFis "----\nFiltered Fis by clique size :\n"
204 $ filterFis k t (filterFisByClique)
205 $ traceFis "----\nFiltered Fis by support :\n"
206 $ filterFis k s (filterFisBySupport)
207 $ traceFis "----\nUnfiltered Fis :\n" fis
215 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
216 traceFis lbl m = trace (lbl <> "count : " <> show (sum $ map length $ elems m) <> " Fis\n"
217 <> "support : " <> show (countSup 1 supps) <> " (>1) "
218 <> show (countSup 2 supps) <> " (>2) "
219 <> show (countSup 3 supps) <> " (>3) "
220 <> show (countSup 4 supps) <> " (>4) "
221 <> show (countSup 5 supps) <> " (>5) "
222 <> show (countSup 6 supps) <> " (>6)\n"
223 <> "clique size : " <> show (countSup 1 ngrms) <> " (>1) "
224 <> show (countSup 2 ngrms) <> " (>2) "
225 <> show (countSup 3 ngrms) <> " (>3) "
226 <> show (countSup 4 ngrms) <> " (>4) "
227 <> show (countSup 5 ngrms) <> " (>5) "
228 <> show (countSup 6 ngrms) <> " (>6)\n"
231 --------------------------------------
232 countSup :: Double -> [Double] -> Int
233 countSup s l = length $ filter (>s) l
234 --------------------------------------
236 supps = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems m
237 --------------------------------------
239 ngrms = sort $ map (\f -> fromIntegral $ size $ _phyloFis_clique f) $ concat $ elems m
240 --------------------------------------