Portability : POSIX
-}
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE FlexibleContexts #-}
-{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.PhyloMaker where
-import Data.List (concat, nub, partition, sort, (++))
-import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), filterWithKey, restrictKeys)
-import Data.Set (size)
+import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
+import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
import Data.Vector (Vector)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
+import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
+import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
+import Gargantext.Viz.Graph.MaxClique (getMaxCliques)
+import Gargantext.Viz.Graph.Distances (Distance(Conditional))
import Control.DeepSeq (NFData)
import Control.Parallel.Strategies (parList, rdeepseq, using)
toPhylo :: [Document] -> TermList -> Config -> Phylo
-toPhylo docs lst conf = phylo1
+toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
+ $ traceToPhylo (phyloLevel conf) $
+ if (phyloLevel conf) > 1
+ then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
+ else phylo1
where
--------------------------------------
phylo1 :: Phylo
-- | To Phylo 1 | --
--------------------
-
-appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
+toGroupsProxi :: Level -> Phylo -> Phylo
+toGroupsProxi lvl phylo =
+ let proximity = phyloProximity $ getConfig phylo
+ groupsProxi = foldlWithKey (\acc pId pds ->
+ -- 1) process period by period
+ let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
+ $ elems
+ $ view ( phylo_periodLevels
+ . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
+ . phylo_levelGroups ) pds
+ next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
+ targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
+ docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
+ diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
+ -- 2) compute the pairs in parallel
+ pairs = map (\(id,ngrams) ->
+ map (\(id',ngrams') ->
+ let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
+ diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
+ in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
+ ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
+ ) egos
+ pairs' = pairs `using` parList rdeepseq
+ in acc ++ (concat pairs')
+ ) [] $ phylo ^. phylo_periods
+ in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
+
+
+appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
$ over ( phylo_periods
. traverse
(\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
then
let pId = phyloLvl ^. phylo_levelPeriod
- phyloFis = m ! pId
+ phyloCUnit = m ! pId
in phyloLvl
& phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
groups ++ [ (((pId,lvl),length groups)
- , f obj pId lvl (length groups) (getRoots phylo)
+ , f obj pId lvl (length groups)
(elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
- ] ) [] phyloFis)
+ ] ) [] phyloCUnit)
else
phyloLvl )
phylo
-fisToGroup :: PhyloFis -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
-fisToGroup fis pId lvl idx fdt coocs =
- let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloFis_clique) fdt
- in PhyloGroup pId lvl idx
- (fis ^. phyloFis_support)
- ngrams
- (ngramsToCooc ngrams coocs)
- (1,[])
- [] [] [] [] []
+cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
+cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
+ (fis ^. phyloClique_support)
+ (fis ^. phyloClique_nodes)
+ (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
+ (1,[0]) -- branchid (lvl,[path in the branching tree])
+ (fromList [("breaks",[0]),("seaLevels",[0])])
+ [] [] [] []
toPhylo1 :: [Document] -> Phylo -> Phylo
-toPhylo1 docs phyloBase = appendGroups fisToGroup 1 phyloFis phyloBase
+toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
+ Constante start gap -> constanteTemporalMatching start gap
+ $ appendGroups cliqueToGroup 1 phyloClique phyloBase
+ Adaptative steps -> adaptativeTemporalMatching steps
+ $ toGroupsProxi 1
+ $ appendGroups cliqueToGroup 1 phyloClique phyloBase
where
--------------------------------------
- phyloFis :: Map (Date,Date) [PhyloFis]
- phyloFis = toPhyloFis docs' (getFisSupport $ contextualUnit $ getConfig phyloBase) (getFisSize $ contextualUnit $ getConfig phyloBase)
+ phyloClique :: Map (Date,Date) [PhyloClique]
+ phyloClique = toPhyloClique phyloBase docs'
--------------------------------------
docs' :: Map (Date,Date) [Document]
- docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
+ docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
+ -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
--------------------------------------
---------------------------
--- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
-filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
-filterFis keep thr f m = case keep of
+-- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
+filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
+filterClique keep thr f m = case keep of
False -> map (\l -> f thr l) m
True -> map (\l -> keepFilled (f) thr l) m
--- | To filter Fis with small Support
-filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
-filterFisBySupport thr l = filter (\fis -> (fis ^. phyloFis_support) >= thr) l
+-- To filter Fis with small Support
+filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
+filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
--- | To filter Fis with small Clique size
-filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
-filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloFis_clique) >= thr) l
+-- To filter Fis with small Clique size
+filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
+filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
--- | To filter nested Fis
-filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
-filterFisByNested m =
- let fis = map (\l ->
- foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloFis_clique) (Set.toList $ f ^. phyloFis_clique)) mem)
+-- To filter nested Fis
+filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
+filterCliqueByNested m =
+ let clq = map (\l ->
+ foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
then mem
else
- let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloFis_clique) (Set.toList $ f' ^. phyloFis_clique)) mem
+ let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
in fMax ++ [f] ) [] l)
$ elems m
- fis' = fis `using` parList rdeepseq
- in fromList $ zip (keys m) fis'
-
-
--- | To transform a time map of docs innto a time map of Fis with some filters
-toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloFis]
-toPhyloFis phyloDocs support clique = traceFis "Filtered Fis"
- $ filterFisByNested
- $ traceFis "Filtered by clique size"
- $ filterFis True clique (filterFisByClique)
- $ traceFis "Filtered by support"
- $ filterFis True support (filterFisBySupport)
- $ traceFis "Unfiltered Fis" phyloFis
+ clq' = clq `using` parList rdeepseq
+ in fromList $ zip (keys m) clq'
+
+
+-- | To transform a time map of docs into a time map of Fis with some filters
+toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
+toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
+ Fis s s' -> -- traceFis "Filtered Fis"
+ filterCliqueByNested
+ {- \$ traceFis "Filtered by clique size" -}
+ $ filterClique True s' (filterCliqueBySize)
+ {- \$ traceFis "Filtered by support" -}
+ $ filterClique True s (filterCliqueBySupport)
+ {- \$ traceFis "Unfiltered Fis" -}
+ phyloClique
+ MaxClique s -> filterClique True s (filterCliqueBySize)
+ phyloClique
where
--------------------------------------
- phyloFis :: Map (Date,Date) [PhyloFis]
- phyloFis =
- let fis = map (\(prd,docs) -> let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
- in (prd, map (\f -> PhyloFis (fst f) (snd f) prd) lst))
- $ toList phyloDocs
- fis' = fis `using` parList rdeepseq
- in fromList fis'
+ phyloClique :: Map (Date,Date) [PhyloClique]
+ phyloClique = case (clique $ getConfig phylo) of
+ Fis _ _ ->
+ let fis = map (\(prd,docs) ->
+ let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
+ in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
+ $ toList phyloDocs
+ fis' = fis `using` parList rdeepseq
+ in fromList fis'
+ MaxClique _ ->
+ let mcl = map (\(prd,docs) ->
+ let cooc = map round
+ $ foldl sumCooc empty
+ $ map listToMatrix
+ $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
+ in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.001 cooc))
+ $ toList phyloDocs
+ mcl' = mcl `using` parList rdeepseq
+ in fromList mcl'
--------------------------------------
+ -- dev viz graph maxClique getMaxClique
+
--------------------
-- | Coocurency | --
--------------------
--- | To build the local cooc matrix of each phylogroup
-ngramsToCooc :: [Int] -> [Cooc] -> Cooc
-ngramsToCooc ngrams coocs =
- let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
- pairs = listToKeys ngrams
- in filterWithKey (\k _ -> elem k pairs) cooc
-
-
--- | To transform the docs into a time map of coocurency matrix
+-- To transform the docs into a time map of coocurency matrix
docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
docsToTimeScaleCooc docs fdt =
let mCooc = fromListWith sumCooc
mCooc' = fromList
$ map (\t -> (t,empty))
$ toTimeScale (map date docs) 1
- in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
+ in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
$ unionWith sumCooc mCooc mCooc'
-- | to Phylo Base | --
-----------------------
+groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
+groupDocsByPeriodRec f prds docs acc =
+ if ((null prds) || (null docs))
+ then acc
+ else
+ let prd = head' "groupBy" prds
+ docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
+ in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
+
+
+-- To group a list of Documents by fixed periods
+groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
+groupDocsByPeriod' f pds docs =
+ let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
+ periods = map (inPeriode f docs') pds
+ periods' = periods `using` parList rdeepseq
+ in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
+ $ fromList $ zip pds periods'
+ where
+ --------------------------------------
+ inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
+ inPeriode f' h (start,end) =
+ concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
+
--- | To group a list of Documents by fixed periods
+
+-- To group a list of Documents by fixed periods
groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
groupDocsByPeriod f pds es =
--------------------------------------
--- | To count the number of docs by unit of time
+docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
+docsToTermFreq docs fdt =
+ let nbDocs = fromIntegral $ length docs
+ freqs = map (/(nbDocs))
+ $ fromList
+ $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
+ $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
+ sumFreqs = sum $ elems freqs
+ in map (/sumFreqs) freqs
+
+
+-- To count the number of docs by unit of time
docsToTimeScaleNb :: [Document] -> Map Date Double
docsToTimeScaleNb docs =
let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
--- | To init the basic elements of a Phylo
+-- To init the basic elements of a Phylo
toPhyloBase :: [Document] -> TermList -> Config -> Phylo
toPhyloBase docs lst conf =
let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
$ Phylo foundations
(docsToTimeScaleCooc docs (foundations ^. foundations_roots))
(docsToTimeScaleNb docs)
+ (docsToTermFreq docs (foundations ^. foundations_roots))
+ empty
+ empty
params
- (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods)
+ (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)