2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.Aggregates
21 import Control.Lens hiding (makeLenses, both, Level)
23 import Gargantext.Prelude hiding (elem)
24 import Gargantext.Text.Context (TermList)
25 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
26 import Gargantext.Text.Terms.Mono (monoTexts)
27 import Gargantext.Viz.Phylo
28 import Gargantext.Viz.Phylo.Tools
30 import Debug.Trace (trace)
32 import Data.List (partition, concat, nub, elem, sort, (++), null)
33 import Data.Map (Map, fromList, fromListWith, adjust, filterWithKey, toList, elems, keys, unionWith, mapWithKey)
34 import Data.Set (size)
35 import Data.Text (Text, unwords)
36 import Data.Vector (Vector)
38 import qualified Data.Vector as Vector
46 -- | Extract all the labels of a termList
47 termListToNgrams :: TermList -> [Ngrams]
48 termListToNgrams = map (\(lbl,_) -> unwords lbl)
55 -- | To group a list of Documents by fixed periods
56 groupDocsByPeriod :: (Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
57 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
58 groupDocsByPeriod f pds es = trace ("----\nGroup docs by periods\n") $ fromList $ zip pds $ map (inPeriode f es) pds
60 --------------------------------------
61 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
62 inPeriode f' h (start,end) =
63 fst $ partition (\d -> f' d >= start && f' d <= end) h
64 --------------------------------------
67 -- | To parse a list of Documents by filtering on a Vector of Ngrams
68 parseDocs :: Vector Ngrams -> [(Date,Text)] -> [Document]
69 parseDocs roots c = map (\(d,t)
70 -> Document d ( filter (\x -> Vector.elem x roots)
73 -- | To count the number of documents by year
74 countDocs :: [(Date,a)] -> Map Date Double
75 countDocs corpus = fromListWith (+) $ map (\(d,_) -> (d,1)) corpus
83 -- | To init a list of Periods framed by a starting Date and an ending Date
84 initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)]
85 initPeriods g s (start,end) = map (\l -> (head' "initPeriods" l, last' "initPeriods" l))
86 $ chunkAlong g s [start .. end]
94 -- | To transform a tuple of group's information into a coocurency Matrix
95 toCooc :: [([Int],Double)] -> Map (Int, Int) Double
96 toCooc l = map (/docs)
97 $ foldl (\mem x -> adjust (+1) x mem) cooc
99 $ map (\x -> listToFullCombi $ fst x) l
101 --------------------------------------
103 idx = nub $ concat $ map fst l
104 --------------------------------------
106 docs = sum $ map snd l
107 --------------------------------------
108 cooc :: Map (Int, Int) (Double)
109 cooc = fromList $ map (\x -> (x,0)) $ listToFullCombi idx
110 --------------------------------------
113 -- | To reduce a coocurency Matrix to some keys
114 getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
115 getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
116 && (elem (snd k) idx)) cooc
119 -- | To get a coocurency Matrix related to a given list of Periods
120 getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
121 getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
123 --------------------------------------
124 -- | Here we need to go back to the level 1 (aka : the Fis level)
126 gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
127 --------------------------------------
130 -- | To transform a list of index into a cooc matrix
131 listToCooc :: [Int] -> Map (Int,Int) Double
132 listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
135 -- | To build the cooc matrix by years out of the corpus
136 docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
137 docsToCooc docs fdt = fromListWith sumCooc
138 $ map (\(d,l) -> (d, listToCooc l))
139 $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
147 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
148 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
149 filterFis keep thr f m = case keep of
150 False -> map (\l -> f thr l) m
151 True -> map (\l -> keepFilled (f) thr l) m
154 -- | To filter Fis with small Support
155 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
156 filterFisBySupport thr l = filter (\fis -> getSupport fis >= thr) l
159 -- | To filter Fis with small Clique size
160 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
161 filterFisByClique thr l = filter (\fis -> (size $ getClique fis) >= thr) l
164 -- | To filter nested Fis
165 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
166 filterFisByNested = map (\l -> let cliqueMax = filterNestedSets (head' "Fis" $ map getClique l) (map getClique l) []
167 in filter (\fis -> elem (getClique fis) cliqueMax) l)
170 -- | Choose if we use a set of Fis from a file or if we have to create them
171 docsToFis :: Map (Date,Date) [Document] -> Phylo -> Phylo
172 docsToFis m p = if (null $ getPhyloFis p)
173 then trace("----\nRebuild the Fis from scratch\n")
174 $ p & phylo_fis .~ mapWithKey (\k docs -> let fis = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
175 in map (\f -> PhyloFis (fst f) (snd f) k) fis) m
176 else trace("----\nUse Fis from an existing file\n")
177 $ p & phylo_fis %~ (unionWith (++) (fromList $ map (\k -> (k,[])) $ keys m))
180 -- | Process some filters on top of a set of Fis
181 refineFis :: Map (Date, Date) [PhyloFis] -> Bool -> Support -> Int -> Map (Date, Date) [PhyloFis]
182 refineFis fis k s t = traceFis "----\nFiltered Fis by clique size :\n"
183 $ filterFis k t (filterFisByClique)
184 $ traceFis "----\nFiltered Fis by nested :\n"
186 $ traceFis "----\nFiltered Fis by support :\n"
187 $ filterFis k s (filterFisBySupport)
188 $ traceFis "----\nUnfiltered Fis :\n" fis
196 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
197 traceFis lbl m = trace (lbl <> "count : " <> show (sum $ map length $ elems m) <> " Fis\n"
198 <> "support : " <> show (countSup 1 supps) <> " (>1) "
199 <> show (countSup 2 supps) <> " (>2) "
200 <> show (countSup 3 supps) <> " (>3) "
201 <> show (countSup 4 supps) <> " (>4) "
202 <> show (countSup 5 supps) <> " (>5) "
203 <> show (countSup 6 supps) <> " (>6)\n"
204 <> "clique size : " <> show (countSup 1 ngrms) <> " (>1) "
205 <> show (countSup 2 ngrms) <> " (>2) "
206 <> show (countSup 3 ngrms) <> " (>3) "
207 <> show (countSup 4 ngrms) <> " (>4) "
208 <> show (countSup 5 ngrms) <> " (>5) "
209 <> show (countSup 6 ngrms) <> " (>6)\n"
212 --------------------------------------
213 countSup :: Double -> [Double] -> Int
214 countSup s l = length $ filter (>s) l
215 --------------------------------------
217 supps = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems m
218 --------------------------------------
220 ngrms = sort $ map (\f -> fromIntegral $ size $ _phyloFis_clique f) $ concat $ elems m
221 --------------------------------------