]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
Merge branch 'dev-phylo' into dev-merge
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union)
19 import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
25
26 import Prelude (logBase)
27 import Control.Lens hiding (Level)
28
29 import qualified Data.Set as Set
30
31 -------------------
32 -- | Proximity | --
33 -------------------
34
35
36 -- | Process the inverse sumLog
37 sumInvLog :: Double -> [Double] -> Double
38 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
39
40
41 -- | Process the sumLog
42 sumLog :: Double -> [Double] -> Double
43 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
44
45
46 -- | To compute a jaccard similarity between two lists
47 jaccard :: [Int] -> [Int] -> Double
48 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
49
50
51 -- | To process a WeighedLogJaccard distance between to coocurency matrix
52 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
53 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
54 | null ngramsInter = 0
55 | ngramsInter == ngramsUnion = 1
56 | sens == 0 = jaccard ngramsInter ngramsUnion
57 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
58 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
59 where
60 --------------------------------------
61 ngramsInter :: [Int]
62 ngramsInter = intersect ngrams ngrams'
63 --------------------------------------
64 ngramsUnion :: [Int]
65 ngramsUnion = union ngrams ngrams'
66 --------------------------------------
67 coocInter :: [Double]
68 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
69 --------------------------------------
70 coocUnion :: [Double]
71 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
72 --------------------------------------
73
74
75 -- | To choose a proximity function
76 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
77 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
78 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
79 Hamming -> undefined
80
81
82 filterProximity :: Proximity -> Double -> Double -> Bool
83 filterProximity proximity thr local =
84 case proximity of
85 WeightedLogJaccard _ _ _ -> local >= thr
86 Hamming -> undefined
87
88
89 -- | To process the proximity between a current group and a pair of targets group
90 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
91 toProximity docs proximity ego target target' =
92 let docs' = sum $ elems docs
93 cooc = if target == target'
94 then (target ^. phylo_groupCooc)
95 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
96 ngrams = if target == target'
97 then (target ^. phylo_groupNgrams)
98 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
99 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
100
101
102 ------------------------
103 -- | Local Matching | --
104 ------------------------
105
106
107 -- | Find pairs of valuable candidates to be matched
108 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
109 makePairs candidates periods = case null periods of
110 True -> []
111 -- | at least on of the pair candidates should be from the last added period
112 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
113 || (inLastPeriod cdt' periods))
114 $ listToKeys candidates
115 where
116 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
117 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
118
119
120 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup
121 phyloGroupMatching candidates fil proxi docs thr ego = case pointers of
122 Nothing -> addPointers ego fil TemporalPointer []
123 Just pts -> addPointers ego fil TemporalPointer
124 $ head' "phyloGroupMatching"
125 -- | Keep only the best set of pointers grouped by proximity
126 $ groupBy (\pt pt' -> snd pt == snd pt')
127 $ reverse $ sortOn snd pts
128 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
129 where
130 pointers :: Maybe [Pointer]
131 pointers = find (not . null)
132 -- | for each time frame, process the proximity on relevant pairs of targeted groups
133 $ scanl (\acc groups ->
134 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
135 pairs = makePairs (concat groups) periods
136 in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity)
137 $ concat
138 $ map (\(c,c') ->
139 -- | process the proximity between the current group and a pair of candidates
140 let proximity = toProximity (filterDocs docs periods) proxi ego c c'
141 in if (c == c')
142 then [(getGroupId c,proximity)]
143 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
144 ) []
145 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
146 $ inits candidates
147 --------------------------------------
148 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
149 filterDocs d pds = restrictKeys d $ periodsToYears pds
150
151
152 -----------------------------
153 -- | Matching Processing | --
154 -----------------------------
155
156
157 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
158 getNextPeriods fil max' pId pIds =
159 case fil of
160 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
161 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
162
163
164 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
165 getCandidates fil ego pIds targets =
166 case fil of
167 ToChilds -> targets'
168 ToParents -> reverse targets'
169 where
170 targets' :: [[PhyloGroup]]
171 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
172 $ filterWithKey (\k _ -> elem k pIds)
173 $ fromListWith (++)
174 $ sortOn (fst . fst)
175 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
176
177
178 processMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
179 processMatching max' periods proximity thr docs groups =
180 map (\group ->
181 let childs = getCandidates ToChilds group
182 (getNextPeriods ToChilds max' (group ^. phylo_groupPeriod) periods) groups
183 parents = getCandidates ToParents group
184 (getNextPeriods ToParents max' (group ^. phylo_groupPeriod) periods) groups
185 in phyloGroupMatching parents ToParents proximity docs thr
186 $ phyloGroupMatching childs ToChilds proximity docs thr group
187 ) groups
188
189
190 -----------------------
191 -- | Phylo Quality | --
192 -----------------------
193
194
195 termFreq :: Int -> [[PhyloGroup]] -> Double
196 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
197 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
198
199
200 entropy :: [[PhyloGroup]] -> Double
201 entropy branches =
202 let terms = ngramsInBranches branches
203 in sum $ map (\term -> (1 / log (termFreq term branches))
204 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
205 * (sum $ map (\branch ->
206 let q = branchObs term (length $ concat branches) branch
207 in q * logBase 2 q ) branches) ) terms
208 where
209 -- | Probability to observe a branch given a random term of the phylo
210 branchObs :: Int -> Int -> [PhyloGroup] -> Double
211 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
212 / (fromIntegral total)
213
214
215 homogeneity :: [[PhyloGroup]] -> Double
216 homogeneity branches = undefined
217 where
218 branchCov ::
219
220
221 toPhyloQuality :: [[PhyloGroup]] -> Double
222 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
223
224
225 -----------------------------
226 -- | Adaptative Matching | --
227 -----------------------------
228
229
230 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
231 groupsToBranches groups =
232 -- | run the related component algorithm
233 let graph = zip [1..]
234 $ relatedComponents
235 $ map (\group -> [getGroupId group]
236 ++ (map fst $ group ^. phylo_groupPeriodParents)
237 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
238 -- | update each group's branch id
239 in map (\(bId,ids) ->
240 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
241 $ elems $ restrictKeys groups (Set.fromList ids)
242 ) graph
243
244
245 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [PhyloGroup] -> [PhyloGroup]
246 recursiveMatching proximity thr max' periods docs quality groups =
247 case quality < quality' of
248 -- | success : we localy improve the quality of the branch, let's go deeper
249 True -> concat
250 $ map (\branch ->
251 recursiveMatching proximity (thr + (getThresholdStep proximity)) max' periods docs quality' branch
252 ) branches
253 -- | failure : last step was the local maximum, let's validate it
254 False -> groups
255 where
256 -- | 3) process a quality score on the local set of branches
257 quality' :: Double
258 quality' = toPhyloQuality branches
259 -- | 2) group the new groups into branches
260 branches :: [[PhyloGroup]]
261 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
262 -- | 1) process a temporal matching for each group
263 groups' :: [PhyloGroup]
264 groups' = processMatching max' periods proximity thr docs groups
265
266
267 temporalMatching :: Phylo -> Phylo
268 temporalMatching phylo = updatePhyloGroups 1 branches phylo
269 where
270 -- | 2) run the recursive matching to find the best repartition among branches
271 branches :: Map PhyloGroupId PhyloGroup
272 branches = fromList
273 $ map (\g -> (getGroupId g, g))
274 $ recursiveMatching (phyloProximity $ getConfig phylo)
275 (getThresholdInit $ phyloProximity $ getConfig phylo)
276 (getTimeFrame $ timeUnit $ getConfig phylo)
277 (getPeriodIds phylo)
278 (phylo ^. phylo_timeDocs) (toPhyloQuality [groups']) groups'
279 -- | 1) for each group process an initial temporal Matching
280 groups' :: [PhyloGroup]
281 groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
282 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
283 (phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)