]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
[ELEVE] main specs more generic.
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union)
19 import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
25
26 import Prelude (logBase)
27 import Control.Lens hiding (Level)
28
29 import qualified Data.Set as Set
30
31 -------------------
32 -- | Proximity | --
33 -------------------
34
35
36 -- | Process the inverse sumLog
37 sumInvLog :: Double -> [Double] -> Double
38 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
39
40
41 -- | Process the sumLog
42 sumLog :: Double -> [Double] -> Double
43 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
44
45
46 -- | To compute a jaccard similarity between two lists
47 jaccard :: [Int] -> [Int] -> Double
48 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
49
50
51 -- | To process a WeighedLogJaccard distance between to coocurency matrix
52 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
53 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
54 | null ngramsInter = 0
55 | ngramsInter == ngramsUnion = 1
56 | sens == 0 = jaccard ngramsInter ngramsUnion
57 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
58 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
59 where
60 --------------------------------------
61 ngramsInter :: [Int]
62 ngramsInter = intersect ngrams ngrams'
63 --------------------------------------
64 ngramsUnion :: [Int]
65 ngramsUnion = union ngrams ngrams'
66 --------------------------------------
67 coocInter :: [Double]
68 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
69 --------------------------------------
70 coocUnion :: [Double]
71 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
72 --------------------------------------
73
74
75 -- | To choose a proximity function
76 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
77 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
78 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
79 Hamming -> undefined
80
81
82 filterProximity :: Proximity -> Double -> Double -> Bool
83 filterProximity proximity thr local =
84 case proximity of
85 WeightedLogJaccard _ _ _ -> local >= thr
86 Hamming -> undefined
87
88
89 -- | To process the proximity between a current group and a pair of targets group
90 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
91 toProximity docs proximity ego target target' =
92 let docs' = sum $ elems docs
93 cooc = if target == target'
94 then (target ^. phylo_groupCooc)
95 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
96 ngrams = if target == target'
97 then (target ^. phylo_groupNgrams)
98 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
99 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
100
101
102 ------------------------
103 -- | Local Matching | --
104 ------------------------
105
106
107 -- | Find pairs of valuable candidates to be matched
108 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
109 makePairs candidates periods = case null periods of
110 True -> []
111 -- | at least on of the pair candidates should be from the last added period
112 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
113 || (inLastPeriod cdt' periods))
114 $ listToKeys candidates
115 where
116 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
117 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
118
119
120 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup
121 phyloGroupMatching candidates fil proxi docs thr ego = case pointers of
122 Nothing -> addPointers ego fil TemporalPointer []
123 Just pts -> addPointers ego fil TemporalPointer
124 $ head' "phyloGroupMatching"
125 -- | Keep only the best set of pointers grouped by proximity
126 $ groupBy (\pt pt' -> snd pt == snd pt')
127 $ reverse $ sortOn snd pts
128 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
129 where
130 pointers :: Maybe [Pointer]
131 pointers = find (not . null)
132 -- | for each time frame, process the proximity on relevant pairs of targeted groups
133 $ scanl (\acc groups ->
134 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
135 pairs = makePairs (concat groups) periods
136 in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity)
137 $ concat
138 $ map (\(c,c') ->
139 -- | process the proximity between the current group and a pair of candidates
140 let proximity = toProximity (filterDocs docs periods) proxi ego c c'
141 in if (c == c')
142 then [(getGroupId c,proximity)]
143 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
144 ) []
145 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
146 $ inits candidates
147 --------------------------------------
148 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
149 filterDocs d pds = restrictKeys d $ periodsToYears pds
150
151
152 -----------------------------
153 -- | Matching Processing | --
154 -----------------------------
155
156
157 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
158 getNextPeriods fil max' pId pIds =
159 case fil of
160 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
161 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
162
163
164 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
165 getCandidates fil ego pIds targets =
166 case fil of
167 ToChilds -> targets'
168 ToParents -> reverse targets'
169 where
170 targets' :: [[PhyloGroup]]
171 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
172 $ filterWithKey (\k _ -> elem k pIds)
173 $ fromListWith (++)
174 $ sortOn (fst . fst)
175 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
176
177
178 processMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
179 processMatching max' periods proximity thr docs groups =
180 map (\group ->
181 let childs = getCandidates ToChilds group
182 (getNextPeriods ToChilds max' (group ^. phylo_groupPeriod) periods) groups
183 parents = getCandidates ToParents group
184 (getNextPeriods ToParents max' (group ^. phylo_groupPeriod) periods) groups
185 in phyloGroupMatching parents ToParents proximity docs thr
186 $ phyloGroupMatching childs ToChilds proximity docs thr group
187 ) groups
188
189
190 -----------------------
191 -- | Phylo Quality | --
192 -----------------------
193
194
195 termFreq :: Int -> [[PhyloGroup]] -> Double
196 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
197 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
198
199
200 entropy :: [[PhyloGroup]] -> Double
201 entropy branches =
202 let terms = ngramsInBranches branches
203 in sum $ map (\term -> (1 / log (termFreq term branches))
204 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
205 * (sum $ map (\branch ->
206 let q = branchObs term (length $ concat branches) branch
207 in q * logBase 2 q ) branches) ) terms
208 where
209 -- | Probability to observe a branch given a random term of the phylo
210 branchObs :: Int -> Int -> [PhyloGroup] -> Double
211 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
212 / (fromIntegral total)
213
214
215 homogeneity :: [[PhyloGroup]] -> Double
216 homogeneity _ = undefined
217 -- where
218 -- branchCov :: [PhyloGroup] -> Int -> Double
219 -- branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
220
221
222 toPhyloQuality :: [[PhyloGroup]] -> Double
223 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
224
225
226 -----------------------------
227 -- | Adaptative Matching | --
228 -----------------------------
229
230
231 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
232 groupsToBranches groups =
233 -- | run the related component algorithm
234 let graph = zip [1..]
235 $ relatedComponents
236 $ map (\group -> [getGroupId group]
237 ++ (map fst $ group ^. phylo_groupPeriodParents)
238 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
239 -- | update each group's branch id
240 in map (\(bId,ids) ->
241 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
242 $ elems $ restrictKeys groups (Set.fromList ids)
243 ) graph
244
245
246 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [PhyloGroup] -> [PhyloGroup]
247 recursiveMatching proximity thr max' periods docs quality groups =
248 case quality < quality' of
249 -- | success : we localy improve the quality of the branch, let's go deeper
250 True -> concat
251 $ map (\branch ->
252 recursiveMatching proximity (thr + (getThresholdStep proximity)) max' periods docs quality' branch
253 ) branches
254 -- | failure : last step was the local maximum, let's validate it
255 False -> groups
256 where
257 -- | 3) process a quality score on the local set of branches
258 quality' :: Double
259 quality' = toPhyloQuality branches
260 -- | 2) group the new groups into branches
261 branches :: [[PhyloGroup]]
262 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
263 -- | 1) process a temporal matching for each group
264 groups' :: [PhyloGroup]
265 groups' = processMatching max' periods proximity thr docs groups
266
267
268 temporalMatching :: Phylo -> Phylo
269 temporalMatching phylo = updatePhyloGroups 1 branches phylo
270 where
271 -- | 2) run the recursive matching to find the best repartition among branches
272 branches :: Map PhyloGroupId PhyloGroup
273 branches = fromList
274 $ map (\g -> (getGroupId g, g))
275 $ recursiveMatching (phyloProximity $ getConfig phylo)
276 (getThresholdInit $ phyloProximity $ getConfig phylo)
277 (getTimeFrame $ timeUnit $ getConfig phylo)
278 (getPeriodIds phylo)
279 (phylo ^. phylo_timeDocs) (toPhyloQuality [groups']) groups'
280 -- | 1) for each group process an initial temporal Matching
281 groups' :: [PhyloGroup]
282 groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
283 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
284 (phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)