2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.Aggregates
20 import Control.Parallel.Strategies
22 import Gargantext.Prelude hiding (elem)
23 import Gargantext.Text.Context (TermList)
24 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Text.Terms.Mono (monoTexts)
26 import Gargantext.Viz.Phylo
27 import Gargantext.Viz.Phylo.Tools
29 import Debug.Trace (trace)
31 import Data.List (partition, concat, nub, elem, sort, (++), null, union)
32 import Data.Map (Map, fromList, fromListWith, adjust, filterWithKey, elems, keys, unionWith, mapWithKey)
33 import Data.Set (size)
34 import Data.Text (Text, unwords)
35 import Data.Vector (Vector)
37 import qualified Data.Map as Map
38 import qualified Data.Set as Set
40 import qualified Data.Vector as Vector
48 -- | Extract all the labels of a termList
49 termListToNgrams :: TermList -> [Ngrams]
50 termListToNgrams = map (\(lbl,_) -> unwords lbl)
57 -- | To group a list of Documents by fixed periods
58 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
59 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
60 groupDocsByPeriod f pds es =
61 let periods = map (inPeriode f es) pds
62 periods' = periods `using` parList rdeepseq
64 in trace ("----\nGroup docs by periods\n") $ fromList $ zip pds periods'
66 --------------------------------------
67 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
68 inPeriode f' h (start,end) =
69 fst $ partition (\d -> f' d >= start && f' d <= end) h
70 --------------------------------------
73 -- | To parse a list of Documents by filtering on a Vector of Ngrams
74 parseDocs :: Vector Ngrams -> [(Date,Text)] -> [Document]
75 parseDocs roots c = map (\(d,t)
76 -> Document d ( filter (\x -> Vector.elem x roots)
79 -- | To count the number of documents by year
80 countDocs :: [(Date,a)] -> Map Date Double
81 countDocs corpus = fromListWith (+) $ map (\(d,_) -> (d,1)) corpus
89 -- | To init a list of Periods framed by a starting Date and an ending Date
90 initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)]
91 initPeriods g s (start,end) = map (\l -> (head' "initPeriods" l, last' "initPeriods" l))
92 $ chunkAlong g s [start .. end]
100 -- | To transform a tuple of group's information into a coocurency Matrix
101 toCooc :: [([Int],Double)] -> Map (Int, Int) Double
102 toCooc l = map (/docs)
103 $ foldl (\mem x -> adjust (+1) x mem) cooc
105 $ map (\x -> listToFullCombi $ fst x) l
107 --------------------------------------
109 idx = nub $ concat $ map fst l
110 --------------------------------------
112 docs = sum $ map snd l
113 --------------------------------------
114 cooc :: Map (Int, Int) (Double)
115 cooc = fromList $ map (\x -> (x,0)) $ listToFullCombi idx
116 --------------------------------------
119 -- | To reduce a coocurency Matrix to some keys
120 getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
121 getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
122 && (elem (snd k) idx)) cooc
125 -- | To get a coocurency Matrix related to a given list of Periods
126 getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
127 getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
129 --------------------------------------
130 -- | Here we need to go back to the level 1 (aka : the Fis level)
132 gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
133 --------------------------------------
136 -- | To transform a list of index into a cooc matrix
137 listToCooc :: [Int] -> Map (Int,Int) Double
138 listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
141 -- | To build the cooc matrix by years out of the corpus
142 docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
143 docsToCooc docs fdt = fromListWith sumCooc
144 $ map (\(d,l) -> (d, listToCooc l))
145 $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
153 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
154 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
155 filterFis keep thr f m = case keep of
156 False -> map (\l -> f thr l) m
157 True -> map (\l -> keepFilled (f) thr l) m
160 -- | To filter Fis with small Support
161 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
162 filterFisBySupport thr l = filter (\fis -> getSupport fis >= thr) l
165 -- | To filter Fis with small Clique size
166 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
167 filterFisByClique thr l = filter (\fis -> (size $ getClique fis) >= thr) l
170 -- | To find if l' is nested in l
171 isNested :: Eq a => [a] -> [a] -> Bool
174 | length l' > length l = False
175 | (union l l') == l = True
179 -- | To filter nested Fis
180 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
181 filterFisByNested m =
183 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ getClique f') (Set.toList $ getClique f)) mem)
186 let fMax = filter (\f' -> not $ isNested (Set.toList $ getClique f) (Set.toList $ getClique f')) mem
187 in fMax ++ [f] ) [] l)
189 fis' = fis `using` parList rdeepseq
190 in fromList $ zip (keys m) fis'
193 -- | Choose if we use a set of Fis from a file or if we have to create them
194 docsToFis :: Map (Date,Date) [Document] -> Phylo -> Map (Date, Date) [PhyloFis]
195 docsToFis m p = if (null $ getPhyloFis p)
196 then trace("----\nRebuild the Fis from scratch\n")
197 $ mapWithKey (\k docs -> let fis = Map.toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
198 in map (\f -> PhyloFis (fst f) (snd f) k) fis) m
199 else trace("----\nUse Fis from an existing file\n")
200 $ unionWith (++) (fromList $ map (\k -> (k,[])) $ keys m) (getPhyloFis p)
203 -- | Process some filters on top of a set of Fis
204 refineFis :: Map (Date, Date) [PhyloFis] -> Bool -> Support -> Int -> Map (Date, Date) [PhyloFis]
205 refineFis fis k s t = traceFis "----\nFiltered Fis by nested :\n"
207 $ traceFis "----\nFiltered Fis by clique size :\n"
208 $ filterFis k t (filterFisByClique)
209 $ traceFis "----\nFiltered Fis by support :\n"
210 $ filterFis k s (filterFisBySupport)
211 $ traceFis "----\nUnfiltered Fis :\n" fis
219 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
220 traceFis lbl m = trace (lbl <> "count : " <> show (sum $ map length $ elems m) <> " Fis\n"
221 <> "support : " <> show (countSup 1 supps) <> " (>1) "
222 <> show (countSup 2 supps) <> " (>2) "
223 <> show (countSup 3 supps) <> " (>3) "
224 <> show (countSup 4 supps) <> " (>4) "
225 <> show (countSup 5 supps) <> " (>5) "
226 <> show (countSup 6 supps) <> " (>6)\n"
227 <> "clique size : " <> show (countSup 1 ngrms) <> " (>1) "
228 <> show (countSup 2 ngrms) <> " (>2) "
229 <> show (countSup 3 ngrms) <> " (>3) "
230 <> show (countSup 4 ngrms) <> " (>4) "
231 <> show (countSup 5 ngrms) <> " (>5) "
232 <> show (countSup 6 ngrms) <> " (>6)\n"
235 --------------------------------------
236 countSup :: Double -> [Double] -> Int
237 countSup s l = length $ filter (>s) l
238 --------------------------------------
240 supps = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems m
241 --------------------------------------
243 ngrms = sort $ map (\f -> fromIntegral $ size $ _phyloFis_clique f) $ concat $ elems m
244 --------------------------------------