]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
color update
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, any, nub, union)
19 import Data.Map (Map, fromList, toList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
25
26 import Control.Lens hiding (Level)
27
28
29 -------------------
30 -- | Proximity | --
31 -------------------
32
33
34 -- | Process the inverse sumLog
35 sumInvLog :: Double -> [Double] -> Double
36 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
37
38
39 -- | Process the sumLog
40 sumLog :: Double -> [Double] -> Double
41 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
42
43
44 -- | To compute a jaccard similarity between two lists
45 jaccard :: [Int] -> [Int] -> Double
46 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
47
48
49 -- | To process a WeighedLogJaccard distance between to coocurency matrix
50 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
51 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
52 | null ngramsInter = 0
53 | ngramsInter == ngramsUnion = 1
54 | sens == 0 = jaccard ngramsInter ngramsUnion
55 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
56 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
57 where
58 --------------------------------------
59 ngramsInter :: [Int]
60 ngramsInter = intersect ngrams ngrams'
61 --------------------------------------
62 ngramsUnion :: [Int]
63 ngramsUnion = union ngrams ngrams'
64 --------------------------------------
65 coocInter :: [Double]
66 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
67 --------------------------------------
68 coocUnion :: [Double]
69 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
70 --------------------------------------
71
72
73 -- | To choose a proximity function
74 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
75 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
76 WeightedLogJaccard sens -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
77 Hamming -> undefined
78
79
80 -- | To process the proximity between a current group and a pair of targets group
81 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
82 toProximity docs proximity group target target' =
83 let docs' = sum $ elems docs
84 cooc = if target == target'
85 then (target ^. phylo_groupCooc)
86 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
87 ngrams = if target == target'
88 then (target ^. phylo_groupNgrams)
89 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
90 in pickProximity proximity docs' (group ^. phylo_groupCooc) cooc (group ^. phylo_groupNgrams) ngrams
91
92
93 ------------------------
94 -- | Local Matching | --
95 ------------------------
96
97
98 -- | Find pairs of valuable candidates to be matched
99 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> Double -> Map Date Double -> Proximity -> PhyloGroup -> [(PhyloGroup,PhyloGroup)]
100 makePairs candidates periods thr docs proximity group = case null periods of
101 True -> []
102 -- | at least on of the pair candidates should be from the last added period
103 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
104 || (inLastPeriod cdt' periods))
105 $ listToKeys
106 -- | remove poor candidates from previous periods
107 $ filter (\cdt -> (inLastPeriod cdt periods)
108 || ((toProximity (reframeDocs docs periods) proximity group group cdt) >= thr)) candidates
109 where
110 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
111 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
112
113
114 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Double -> Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup
115 phyloGroupMatching candidates fil thr docs proxi group = case pointers of
116 Nothing -> addPointers group fil TemporalPointer []
117 Just pts -> addPointers group fil TemporalPointer
118 $ head' "phyloGroupMatching"
119 -- | Keep only the best set of pointers grouped by proximity
120 $ groupBy (\pt pt' -> snd pt == snd pt')
121 $ reverse $ sortOn snd pts
122 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
123 where
124 pointers :: Maybe [Pointer]
125 pointers = find (not . null)
126 -- | for each time frame, process the proximity on relevant pairs of targeted groups
127 $ scanl (\acc groups ->
128 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
129 pairs = makePairs (concat groups) periods thr docs proxi group
130 in acc ++ ( filter (\(_,proximity) -> proximity >= thr )
131 $ concat
132 $ map (\(c,c') ->
133 -- | process the proximity between the current group and a pair of candidates
134 let proximity = toProximity (reframeDocs docs periods) proxi group c c'
135 in if (c == c')
136 then [(getGroupId c,proximity)]
137 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
138 ) []
139 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
140 $ inits candidates
141
142
143 -----------------------------
144 -- | Adaptative Matching | --
145 -----------------------------
146
147
148 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
149 getNextPeriods fil max pId pIds =
150 case fil of
151 ToChilds -> take max $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
152 ToParents -> take max $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
153
154
155 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
156 getCandidates fil g pIds targets =
157 case fil of
158 ToChilds -> targets'
159 ToParents -> reverse targets'
160 where
161 targets' :: [[PhyloGroup]]
162 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
163 $ filterWithKey (\k _ -> elem k pIds)
164 $ fromListWith (++)
165 $ sortOn (fst . fst)
166 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
167
168
169 shouldBreak :: Double -> [(Double,[PhyloGroup])] -> Bool
170 shouldBreak thr branches = any (\(quality,_) -> quality < thr) branches
171
172
173 toBranchQuality :: [[PhyloGroup]] -> [(Double,[PhyloGroup])]
174 toBranchQuality branches = undefined
175
176
177 reframeDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
178 reframeDocs docs periods = restrictKeys docs $ periodsToYears periods
179
180
181 -- findGhostLinks :: [Link] -> [[Link]] -> Map PhyloGroupId
182
183 adaptativeMatching :: Int -> Double -> Double -> Double -> Map Date Double -> Proximity -> [PhyloGroup] -> [PhyloGroup] -> [PhyloPeriodId] -> [PhyloGroup]
184 adaptativeMatching maxTime thrStep thrMatch thrQua docs proximity groups candidates periods =
185 -- | check if we should break some of the new branches or not
186 case shouldBreak thrQua branches' of
187 True -> concat $ map (\(s,b) ->
188 if s >= thrQua
189 -- | we keep the branch as it is
190 then b
191 -- | we break the branch using an increased temporal matching threshold
192 else let nextGroups = undefined
193 nextCandidates = undefined
194 nextPeriods = undefined
195 in adaptativeMatching maxTime thrStep (thrMatch + thrStep) thrQua
196 (reframeDocs docs nextPeriods)
197 proximity
198 nextGroups nextCandidates nextPeriods
199 ) branches'
200 -- | the quality of all the new branches is sufficient
201 False -> concat branches
202 where
203 -- | 3) process a quality score for each new branch
204 branches' :: [(Double,[PhyloGroup])]
205 branches' = toBranchQuality branches
206 -- | 2) group the new groups into branches
207 branches :: [[PhyloGroup]]
208 branches = relatedComponents groups'
209 -- | 1) connect each group to its parents and childs
210 groups' :: [PhyloGroup]
211 groups' = map (\group ->
212 let childs = getCandidates ToChilds group
213 (getNextPeriods ToChilds maxTime (group ^. phylo_groupPeriod) periods) candidates
214 parents = getCandidates ToParents group
215 (getNextPeriods ToParents maxTime (group ^. phylo_groupPeriod) periods) candidates
216 -- | match the group to its possible childs then parents
217 in phyloGroupMatching parents ToParents thrMatch docs proximity
218 $ phyloGroupMatching childs ToChilds thrMatch docs proximity group
219 ) groups
220
221
222 temporalMatching :: Phylo -> Phylo
223 temporalMatching phylo =
224 let branches = fromList $ map (\g -> (getGroupId g, g))
225 $ adaptativeMatching (maxTimeMatch $ getConfig phylo) 0 0 0
226 (phylo ^. phylo_timeDocs)
227 (phyloProximity $ getConfig phylo)
228 (getGroupsFromLevel 1 phylo) (getGroupsFromLevel 1 phylo) (getPeriodIds phylo)
229 in updatePhyloGroups 1 branches phylo