]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/BranchMaker.hs
some refactoring & adding generality, specifivity, etc for the ngrams
[gargantext.git] / src / Gargantext / Viz / Phylo / BranchMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Viz.Phylo.BranchMaker
18 where
19
20 import Control.Parallel.Strategies
21 import Control.Lens hiding (both, Level)
22 import Data.List (concat,nub,(++),sortOn,reverse,sort,null,intersect,union,delete)
23 import Data.Map (Map,(!), fromListWith, elems)
24 import Data.Tuple (fst, snd)
25 import Gargantext.Prelude
26 import Gargantext.Viz.Phylo
27 import Gargantext.Viz.Phylo.Cluster
28 import Gargantext.Viz.Phylo.Aggregates
29 import Gargantext.Viz.Phylo.Metrics
30 import Gargantext.Viz.Phylo.Tools
31 import Gargantext.Viz.Phylo.LinkMaker
32
33 import qualified Data.Map as Map
34
35 -- import Debug.Trace (trace)
36
37 ---------------------------
38 -- | Readability links | --
39 ---------------------------
40
41 getGroupsPeriods :: [PhyloGroup] -> [(Date,Date)]
42 getGroupsPeriods gs = sortOn fst $ nub $ map getGroupPeriod gs
43
44 getFramedPeriod :: [PhyloGroup] -> (Date,Date)
45 getFramedPeriod gs = (fst $ (head' "getFramedPeriod" $ getGroupsPeriods gs), snd $ (last' "getFramedPeriod" $ getGroupsPeriods gs))
46
47
48 getGroupsNgrams :: [PhyloGroup] -> [Int]
49 getGroupsNgrams gs = (sort . nub . concat) $ map getGroupNgrams gs
50
51
52 areDistant :: (Date,Date) -> (Date,Date) -> Int -> Bool
53 areDistant prd prd' thr = (((fst prd') - (snd prd)) > thr) || (((fst prd) - (snd prd')) > thr)
54
55
56 -- | Process a Jaccard on top of two set of Branch Peaks
57 areTwinPeaks :: Double -> [Int] -> [Int] -> Bool
58 areTwinPeaks thr ns ns' = ( ((fromIntegral . length) $ intersect ns ns')
59 / ((fromIntegral . length) $ union ns ns')) >= thr
60
61
62 -- | Get the framing period of a branch ([[PhyloGroup]])
63 getBranchPeriod :: [PhyloGroup] -> (Date,Date)
64 getBranchPeriod gs =
65 let dates = sort $ foldl (\mem g -> mem ++ [fst $ getGroupPeriod g, snd $ getGroupPeriod g]) [] gs
66 in (head' "getBranchPeriod" dates, last' "getBranchPeriod" dates)
67
68
69 -- | Get the Nth most coocurent Ngrams in a list of Groups
70 getGroupsPeaks :: [PhyloGroup] -> Int -> Phylo -> [Int]
71 getGroupsPeaks gs nth p = getNthMostOcc nth
72 $ getSubCooc (getGroupsNgrams gs)
73 $ getCooc (getGroupsPeriods gs) p
74
75
76 -- | Reduce a list of branches ([[Phylogroup]]) into possible candidates for rebranching
77 filterSimBranches :: [PhyloGroup] -> Phylo -> [[PhyloGroup]] -> [[PhyloGroup]]
78 filterSimBranches gs p branches = filter (\gs' -> (areTwinPeaks (getPhyloReBranchThr p)
79 (getGroupsPeaks gs (getPhyloReBranchNth p) p)
80 (getGroupsPeaks gs' (getPhyloReBranchNth p) p))
81 && ((not . null) $ intersect (map getGroupNgrams gs') (map getGroupNgrams gs))
82 && (areDistant (getBranchPeriod gs) (getBranchPeriod gs') (getPhyloMatchingFrame p))
83 ) branches
84
85
86 -- | Try to connect a focused branch to other candidate branches by finding the best pointers
87 reBranch :: Phylo -> [PhyloGroup] -> [[PhyloGroup]] -> [(PhyloGroupId,Pointer)]
88 reBranch p branch candidates =
89 let newLinks = map (\branch' ->
90 let pointers = map (\g ->
91 -- define pairs of candidates groups
92 let pairs = listToPairs
93 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams g') (getGroupNgrams g)) branch'
94 -- process the matching between the pairs and the current group
95 in foldl' (\mem (g2,g3) -> let s = 0.1 + matchWithPairs g (g2,g3) p
96 in if (g2 == g3)
97 then mem ++ [(getGroupId g,(getGroupId g2,s))]
98 else mem ++ [(getGroupId g,(getGroupId g2,s)),(getGroupId g,(getGroupId g3,s))]) [] pairs
99 ) branch
100 pointers' = pointers `using` parList rdeepseq
101 -- keep the best pointer between the focused branch and the current candidates
102 in head' "reBranch" $ reverse $ sortOn (snd . snd)
103 $ filter (\(_,(_,s)) -> filterProximity s $ getPhyloProximity p) $ concat pointers'
104 ) candidates
105 newLinks' = newLinks `using` parList rdeepseq
106 in newLinks'
107
108
109 reLinkPhyloBranches :: Level -> Phylo -> Phylo
110 reLinkPhyloBranches lvl p =
111 let pointers = Map.fromList $ map (\(_id,(_id',_s)) -> (_id,[(_id',100)])) $ fst
112 $ foldl' (\(pts,branches') gs -> (pts ++ (reBranch p gs (filterSimBranches gs p branches')), delete gs branches'))
113 ([],branches) branches
114 in setPhyloBranches lvl $ updateGroups Descendant lvl pointers p
115 where
116 branches :: [[PhyloGroup]]
117 branches = elems
118 $ fromListWith (++)
119 $ foldl' (\mem g -> case getGroupBranchId g of
120 Nothing -> mem
121 Just i -> mem ++ [(i,[g])] )
122 [] $ getGroupsWithLevel lvl p
123
124
125 ------------------
126 -- | Branches | --
127 ------------------
128
129
130 -- | To transform a PhyloGraph into a list of PhyloBranches by using the relatedComp clustering
131 graphToBranches :: [PhyloGroup] -> Map PhyloGroupId Int
132 graphToBranches groups = Map.fromList
133 $ concat
134 $ map (\(idx,gIds) -> map (\id -> (id,idx)) gIds)
135 $ zip [1..]
136 $ relatedComp
137 $ map (\g -> [getGroupId g] ++ (getGroupPeriodParentsId g) ++ (getGroupPeriodChildsId g)) groups
138
139
140 -- | To set all the PhyloBranches for a given Level in a Phylo
141 setPhyloBranches :: Level -> Phylo -> Phylo
142 setPhyloBranches lvl p = alterGroupWithLevel (\g ->
143 let bIdx = branches ! (getGroupId g)
144 in over (phylo_groupBranchId) (\_ -> Just (lvl,bIdx)) g) lvl p
145 where
146 --------------------------------------
147 branches :: Map PhyloGroupId Int
148 branches = graphToBranches (getGroupsWithLevel lvl p)
149 --------------------------------------
150
151
152 -- trace' bs = trace bs