{-| Module : Gargantext.Viz.Phylo.Tools Description : Phylomemy Tools to build/manage it Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX -} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ViewPatterns #-} module Gargantext.Viz.Phylo.Aggregates where import Control.Lens hiding (makeLenses, both, Level) import Gargantext.Prelude hiding (elem) import Gargantext.Text.Context (TermList) import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..)) import Gargantext.Text.Terms.Mono (monoTexts) import Gargantext.Viz.Phylo import Gargantext.Viz.Phylo.Tools import Debug.Trace (trace) import Data.List (partition, concat, nub, elem, sort, (++), null) import Data.Map (Map, fromList, fromListWith, adjust, filterWithKey, toList, elems, keys, unionWith, mapWithKey) import Data.Set (size) import Data.Text (Text, unwords) import Data.Vector (Vector) import qualified Data.Vector as Vector --------------------- -- | Foundations | -- --------------------- -- | Extract all the labels of a termList termListToNgrams :: TermList -> [Ngrams] termListToNgrams l = map (\(lbl,_) -> unwords lbl) l ------------------- -- | Documents | -- ------------------- -- | To group a list of Documents by fixed periods groupDocsByPeriod :: (Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods" groupDocsByPeriod f pds es = trace ("----\nGroup docs by periods\n") $ fromList $ zip pds $ map (inPeriode f es) pds where -------------------------------------- inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t] inPeriode f' h (start,end) = fst $ partition (\d -> f' d >= start && f' d <= end) h -------------------------------------- -- | To parse a list of Documents by filtering on a Vector of Ngrams parseDocs :: Vector Ngrams -> [(Date,Text)] -> [Document] parseDocs roots c = map (\(d,t) -> Document d ( filter (\x -> Vector.elem x roots) $ monoTexts t)) c -- | To count the number of documents by year countDocs :: [(Date,a)] -> Map Date Double countDocs corpus = fromListWith (+) $ map (\(d,_) -> (d,1)) corpus ----------------- -- | Periods | -- ----------------- -- | To init a list of Periods framed by a starting Date and an ending Date initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)] initPeriods g s (start,end) = map (\l -> (head' "Doc" l, last' "Doc" l)) $ chunkAlong g s [start .. end] -------------- -- | Cooc | -- -------------- -- | To transform a tuple of group's information into a coocurency Matrix toCooc :: [([Int],Double)] -> Map (Int, Int) Double toCooc l = map (/docs) $ foldl (\mem x -> adjust (+1) x mem) cooc $ concat $ map (\x -> listToFullCombi $ fst x) l where -------------------------------------- idx :: [Int] idx = nub $ concat $ map fst l -------------------------------------- docs :: Double docs = sum $ map snd l -------------------------------------- cooc :: Map (Int, Int) (Double) cooc = fromList $ map (\x -> (x,0)) $ listToFullCombi idx -------------------------------------- -- | To reduce a coocurency Matrix to some keys getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx) && (elem (snd k) idx)) cooc -- | To get a coocurency Matrix related to a given list of Periods getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs where -------------------------------------- -- | Here we need to go back to the level 1 (aka : the Fis level) gs :: [PhyloGroup] gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p -------------------------------------- -- | To transform a list of index into a cooc matrix listToCooc :: [Int] -> Map (Int,Int) Double listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst -- | To build the cooc matrix by years out of the corpus docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double) docsToCooc docs fdt = fromListWith sumCooc $ map (\(d,l) -> (d, listToCooc l)) $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs ------------- -- | Fis | -- ------------- -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False) filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis] filterFis keep thr f m = case keep of False -> map (\l -> f thr l) m True -> map (\l -> keepFilled (f) thr l) m -- | To filter Fis with small Support filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis] filterFisBySupport thr l = filter (\fis -> getSupport fis >= thr) l -- | To filter Fis with small Clique size filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis] filterFisByClique thr l = filter (\fis -> (size $ getClique fis) >= thr) l -- | To filter nested Fis filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis] filterFisByNested = map (\l -> let cliqueMax = filterNestedSets (head' "Fis" $ map getClique l) (map getClique l) [] in filter (\fis -> elem (getClique fis) cliqueMax) l) -- | Choose if we use a set of Fis from a file or if we have to create them docsToFis :: Map (Date,Date) [Document] -> Phylo -> Phylo docsToFis m p = if (null $ getPhyloFis p) then trace("----\nRebuild the Fis from scratch\n") $ p & phylo_fis .~ mapWithKey (\k docs -> let fis = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs) in map (\f -> PhyloFis (fst f) (snd f) k) fis) m else trace("----\nUse Fis from an existing file\n") $ p & phylo_fis %~ (unionWith (++) (fromList $ map (\k -> (k,[])) $ keys m)) -- | Process some filters on top of a set of Fis refineFis :: Map (Date, Date) [PhyloFis] -> Bool -> Support -> Int -> Map (Date, Date) [PhyloFis] refineFis fis k s t = traceFis "----\nFiltered Fis by clique size :\n" $ filterFis k t (filterFisByClique) $ traceFis "----\nFiltered Fis by nested :\n" $ filterFisByNested $ traceFis "----\nFiltered Fis by support :\n" $ filterFis k s (filterFisBySupport) $ traceFis "----\nUnfiltered Fis :\n" fis ----------------- -- | Tracers | -- ----------------- traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis] traceFis lbl m = trace (lbl <> "count : " <> show (sum $ map length $ elems m) <> " Fis\n" <> "support : " <> show (countSup 1 supps) <> " (>1) " <> show (countSup 2 supps) <> " (>2) " <> show (countSup 3 supps) <> " (>3) " <> show (countSup 4 supps) <> " (>4) " <> show (countSup 5 supps) <> " (>5) " <> show (countSup 6 supps) <> " (>6)\n" <> "clique size : " <> show (countSup 1 ngrms) <> " (>1) " <> show (countSup 2 ngrms) <> " (>2) " <> show (countSup 3 ngrms) <> " (>3) " <> show (countSup 4 ngrms) <> " (>4) " <> show (countSup 5 ngrms) <> " (>5) " <> show (countSup 6 ngrms) <> " (>6)\n" ) m where -------------------------------------- countSup :: Double -> [Double] -> Int countSup s l = length $ filter (>s) l -------------------------------------- supps :: [Double] supps = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems m -------------------------------------- ngrms :: [Double] ngrms = sort $ map (\f -> fromIntegral $ size $ _phyloFis_clique f) $ concat $ elems m --------------------------------------