Portability : POSIX
-}
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE FlexibleContexts #-}
-{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.SynchronicClustering where
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
-import Gargantext.Viz.Phylo.TemporalMatching (weightedLogJaccard)
+import Gargantext.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
+import Gargantext.Viz.Phylo.PhyloExport (processDynamics)
-import Data.List ((++), null, intersect, nub, concat, sort)
-import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
+import Data.List ((++), null, intersect, nub, concat, sort, sortOn, all, groupBy, group, maximum)
+import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
+import Data.Text (Text)
import Control.Lens hiding (Level)
-import Debug.Trace (trace)
+import Control.Parallel.Strategies (parList, rdeepseq, using)
+-- import Debug.Trace (trace)
+
+import qualified Data.Map as Map
+import qualified Data.Set as Set
-------------------------
-- | New Level Maker | --
-------------------------
-mergeGroups :: [Cooc] -> PhyloGroupId -> [PhyloGroup] -> PhyloGroup
-mergeGroups coocs id childs =
+mergeBranchIds :: [[Int]] -> [Int]
+mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
+ where
+ -- 2) find the most Up Left ids in the hierarchy of similarity
+ -- mostUpLeft :: [[Int]] -> [[Int]]
+ -- mostUpLeft ids' =
+ -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
+ -- inf = (fst . minimum) groupIds
+ -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
+ -- 1) find the most frequent ids
+ mostFreq' :: [[Int]] -> [[Int]]
+ mostFreq' ids' =
+ let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
+ sup = (fst . maximum) groupIds
+ in map snd $ filter (\gIds -> fst gIds == sup) groupIds
+
+
+mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
+mergeMeta bId groups =
+ let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
+ in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
+
+
+groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
+groupsToBranches' groups =
+ -- run the related component algorithm
+ let egos = map (\g -> [getGroupId g]
+ ++ (map fst $ g ^. phylo_groupPeriodParents)
+ ++ (map fst $ g ^. phylo_groupPeriodChilds) ) $ elems groups
+ graph = relatedComponents egos
+ -- update each group's branch id
+ in map (\ids ->
+ let groups' = elems $ restrictKeys groups (Set.fromList ids)
+ bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
+ in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
+
+
+mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
+mergeGroups coocs id mapIds childs =
let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
- in PhyloGroup (fst $ fst id)
- (snd $ fst id)
- (snd id)
- ""
- (sum $ map _phylo_groupSupport childs)
- ngrams
- (ngramsToCooc ngrams coocs)
- (((head' "mergeGroups" childs) ^. phylo_groupLevel) + 1, snd ((head' "mergeGroups" childs) ^. phylo_groupBranchId))
- empty
- []
- (map (\g -> (getGroupId g, 1)) childs)
- (concat $ map _phylo_groupPeriodParents childs)
- (concat $ map _phylo_groupPeriodChilds childs)
-
-
-addNewLevel :: Level -> Phylo -> Phylo
-addNewLevel lvl phylo =
- over ( phylo_periods
- . traverse )
- (\phyloPrd ->
- phyloPrd & phylo_periodLevels %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl + 1)
- (PhyloLevel (phyloPrd ^. phylo_periodPeriod) (lvl + 1) empty))) phylo
-
-toNextLevel :: Phylo -> [PhyloGroup] -> Phylo
-toNextLevel phylo groups =
- let level = getLastLevel phylo
- phylo' = updatePhyloGroups level (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
- nextGroups = fromListWith (++)
- $ foldlWithKey (\acc k v ->
- let group = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [fst $ fst k]) k v
- in acc ++ [(group ^. phylo_groupPeriod,[group])]) []
- $ fromListWith (++) $ map (\g -> (fst $ head' "nextGroups" $ g ^. phylo_groupLevelParents,[g])) groups
- in trace (">>>>>>>>>>>>>>>>>>>>>>>>" <> show (nextGroups)) over ( phylo_periods
- . traverse
- . phylo_periodLevels
- . traverse
- . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (level + 1)))
- (\phyloLvl -> if member (phyloLvl ^. phylo_levelPeriod) nextGroups
- then phyloLvl & phylo_levelGroups .~ fromList ( map (\g -> (getGroupId g,g))
- $ nextGroups ! (phyloLvl ^. phylo_levelPeriod))
- else phyloLvl
- ) $ addNewLevel level phylo'
-
+ in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
+ (sum $ map _phylo_groupSupport childs) ngrams
+ (ngramsToCooc ngrams coocs)
+ ((snd $ fst id),bId)
+ (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
+ (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
+ (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
+ where
+ --------------------
+ bId :: [Int]
+ bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
+ --------------------
+ updatePointers :: [Pointer] -> [Pointer]
+ updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
+
+
+addPhyloLevel :: Level -> Phylo -> Phylo
+addPhyloLevel lvl phylo =
+ over ( phylo_periods . traverse )
+ (\phyloPrd -> phyloPrd & phylo_periodLevels
+ %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
+
+
+toNextLevel' :: Phylo -> [PhyloGroup] -> Phylo
+toNextLevel' phylo groups =
+ let curLvl = getLastLevel phylo
+ oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
+ newGroups = concat $ groupsToBranches'
+ $ fromList $ map (\g -> (getGroupId g, g))
+ $ foldlWithKey (\acc id groups' ->
+ -- 4) create the parent group
+ let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
+ in acc ++ [parent]) []
+ -- 3) group the current groups by parentId
+ $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
+
+ newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
+ in traceSynchronyEnd
+ $ over ( phylo_periods . traverse . phylo_periodLevels . traverse
+ -- 6) update each period at curLvl + 1
+ . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
+ -- 7) by adding the parents
+ (\phyloLvl ->
+ if member (phyloLvl ^. phylo_levelPeriod) newPeriods
+ then phyloLvl & phylo_levelGroups
+ .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_levelPeriod))
+ else phyloLvl)
+ -- 2) add the curLvl + 1 phyloLevel to the phylo
+ $ addPhyloLevel (curLvl + 1)
+ -- 1) update the current groups (with level parent pointers) in the phylo
+ $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
--------------------
-- | Clustering | --
--------------------
-
-toPairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
-toPairs groups = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))
- $ listToCombi' groups
-
-groupsToEdges :: Proximity -> Double -> Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
-groupsToEdges prox thr docs groups =
- case prox of
- WeightedLogJaccard sens _ _ -> filter (\(_,w) -> w >= thr)
- $ map (\(g,g') -> ((g,g'), weightedLogJaccard sens docs (g ^. phylo_groupCooc) (g' ^. phylo_groupCooc) (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)))
- $ toPairs groups
- _ -> undefined
-
-
-toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
-toRelatedComponents nodes edges = relatedComponents $ ((map (\((g,g'),_) -> [g,g']) edges) ++ (map (\g -> [g]) nodes))
-
-
-reduceBranch :: Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
-reduceBranch prox thr docs branch =
- -- | 1) reduce a branch as a set of periods & groups
+toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
+toPairs strategy groups = case strategy of
+ MergeRegularGroups -> pairs
+ $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
+ MergeAllGroups -> pairs groups
+ where
+ pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
+ pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
+
+
+toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
+toDiamonds groups = foldl' (\acc groups' ->
+ acc ++ ( elems
+ $ Map.filter (\v -> length v > 1)
+ $ fromListWith (++)
+ $ foldl' (\acc' g ->
+ acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
+ $ elems
+ $ Map.filter (\v -> length v > 1)
+ $ fromListWith (++)
+ $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
+
+
+groupsToEdges :: Proximity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
+groupsToEdges prox sync nbDocs diago groups =
+ case sync of
+ ByProximityThreshold thr sens _ strat ->
+ filter (\(_,w) -> w >= thr)
+ $ toEdges sens
+ $ toPairs strat groups
+ ByProximityDistribution sens strat ->
+ let diamonds = sortOn snd
+ $ toEdges sens $ concat
+ $ map (\gs -> toPairs strat gs) $ toDiamonds groups
+ in take (div (length diamonds) 2) diamonds
+ where
+ toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
+ toEdges sens edges =
+ case prox of
+ WeightedLogJaccard _ -> map (\(g,g') ->
+ ((g,g'), weightedLogJaccard' sens nbDocs diago
+ (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
+ _ -> undefined
+
+toParentId :: PhyloGroup -> PhyloGroupId
+toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
+
+
+reduceGroups :: Proximity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
+reduceGroups prox sync docs diagos branch =
+ -- 1) reduce a branch as a set of periods & groups
let periods = fromListWith (++)
$ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
in (concat . concat . elems)
- $ mapWithKey (\prd groups ->
- -- | 2) for each period, transform the groups as a proximity graph filtered by a threshold
- let edges = groupsToEdges prox thr ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) groups
- in map (\(idx,comp) ->
- -- | 4) add to each groups their futur level parent group
- let parentId = (((head' "reduceBranch" comp) ^. phylo_groupPeriod, 1 + (head' "reduceBranch" comp) ^. phylo_groupLevel), idx)
+ $ mapWithKey (\prd groups ->
+ -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
+ let diago = reduceDiagos $ filterDiago diagos [prd]
+ edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
+ in map (\comp ->
+ -- 4) add to each groups their futur level parent group
+ let parentId = toParentId (head' "parentId" comp)
in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
- -- |3) reduce the graph a a set of related components
- $ zip [1..] (toRelatedComponents groups edges)) periods
+ -- 3) reduce the graph a a set of related components
+ $ toRelatedComponents groups edges) periods
+
+
+adjustClustering :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
+adjustClustering sync branches = case sync of
+ ByProximityThreshold _ _ scope _ -> case scope of
+ SingleBranch -> branches
+ SiblingBranches -> groupBy (\g g' -> (last' "adjustClustering" $ (g ^. phylo_groupMeta) ! "breaks")
+ == (last' "adjustClustering" $ (g' ^. phylo_groupMeta) ! "breaks"))
+ $ sortOn _phylo_groupBranchId $ concat branches
+ AllBranches -> [concat branches]
+ ByProximityDistribution _ _ -> branches
+
synchronicClustering :: Phylo -> Phylo
-synchronicClustering phylo =
- case (phyloSynchrony $ getConfig phylo) of
- ByProximityThreshold thr -> toNextLevel phylo
- $ concat
- $ map (\branch -> reduceBranch (phyloProximity $ getConfig phylo) thr (phylo ^. phylo_timeDocs) branch)
- $ phyloToLastBranches phylo
- ByProximityDistribution -> undefined
\ No newline at end of file
+synchronicClustering phylo =
+ let prox = phyloProximity $ getConfig phylo
+ sync = phyloSynchrony $ getConfig phylo
+ docs = phylo ^. phylo_timeDocs
+ diagos = map coocToDiago $ phylo ^. phylo_timeCooc
+ newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
+ $ map processDynamics
+ $ adjustClustering sync
+ $ phyloToLastBranches
+ $ traceSynchronyStart phylo
+ newBranches' = newBranches `using` parList rdeepseq
+ in toNextLevel' phylo $ concat newBranches'
+
+
+-- synchronicDistance :: Phylo -> Level -> String
+-- synchronicDistance phylo lvl =
+-- foldl' (\acc branch ->
+-- acc <> (foldl' (\acc' period ->
+-- acc' <> let prox = phyloProximity $ getConfig phylo
+-- sync = phyloSynchrony $ getConfig phylo
+-- docs = _phylo_timeDocs phylo
+-- prd = _phylo_groupPeriod $ head' "distance" period
+-- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
+-- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
+-- in foldl' (\mem (_,w) ->
+-- mem <> show (prd)
+-- <> "\t"
+-- <> show (w)
+-- <> "\n"
+-- ) "" edges
+-- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
+-- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo