]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/LinkMaker.hs
[FIX] fun + papers read for our purpose
[gargantext.git] / src / Gargantext / Viz / Phylo / LinkMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13
14 module Gargantext.Viz.Phylo.LinkMaker
15 where
16
17 import Control.Parallel.Strategies
18 import Control.Lens hiding (both, Level)
19 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, delete, intersect, elemIndex, groupBy, union, inits, scanl, find)
20 import Data.Tuple.Extra
21 import Data.Map (Map, (!), fromListWith, elems, restrictKeys, filterWithKey, keys, unionWith, unions, intersectionWith, member, fromList)
22 import Gargantext.Prelude
23 import Gargantext.Viz.Phylo
24 import Gargantext.Viz.Phylo.Tools
25 import Gargantext.Viz.Phylo.Metrics
26 import qualified Data.List as List
27 import qualified Data.Maybe as Maybe
28 import qualified Data.Map as Map
29
30 import qualified Data.Vector.Storable as VS
31 import Debug.Trace (trace)
32 import Numeric.Statistics (percentile)
33
34 -----------------------------
35 -- | From Level to level | --
36 -----------------------------
37
38
39 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
40 linkGroupToGroups :: PhyloGroup -> [PhyloGroup] -> PhyloGroup
41 linkGroupToGroups current targets = over (phylo_groupLevelParents) addPointers current
42 where
43 --------------------------------------
44 addPointers :: [Pointer] -> [Pointer]
45 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
46 if (elem (getGroupId current) (getGroupLevelChildsId target))
47 then Just ((getGroupId target),1)
48 else Nothing) targets
49 --------------------------------------
50
51
52 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
53 setLevelLinks (lvl,lvl') p = alterGroupWithLevel (\group -> linkGroupToGroups group
54 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams group) (getGroupNgrams g'))
55 $ getGroupsWithFilters lvl' (getGroupPeriod group) p) lvl p
56
57
58 -------------------------------
59 -- | From Period to Period | --
60 -------------------------------
61
62
63 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
64 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
65 getNextPeriods to' limit id l = case to' of
66 Descendant -> take limit $ (tail . snd) next
67 Ascendant -> take limit $ (reverse . fst) next
68 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
69 where
70 --------------------------------------
71 next :: ([PhyloPeriodId], [PhyloPeriodId])
72 next = splitAt idx l
73 --------------------------------------
74 idx :: Int
75 idx = case (List.elemIndex id l) of
76 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
77 Just i -> i
78 --------------------------------------
79
80
81 -- | To get the number of docs produced during a list of periods
82 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
83 periodsToNbDocs prds phylo = sum $ elems
84 $ restrictKeys (phylo ^. phylo_docsByYears)
85 $ periodsToYears prds
86
87
88 -- | To process a given Proximity
89 processProximity :: Proximity -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
90 processProximity proximity nbDocs cooc cooc' ngrams ngrams' = case proximity of
91 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
92 Hamming (HammingParams _) -> hamming cooc cooc'
93 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
94
95
96 filterProximity :: Double -> Proximity -> Bool
97 filterProximity score prox = case prox of
98 WeightedLogJaccard (WLJParams thr _) -> score >= thr
99 Hamming (HammingParams thr) -> score <= thr
100 _ -> panic "[ERR][Viz.Phylo.LinkMaker.filterProximity] Unknown proximity"
101
102
103 makePairs :: [(Date,Date)] -> PhyloGroup -> Phylo -> [(PhyloGroup,PhyloGroup)]
104 makePairs prds g p = filter (\pair -> ((last' "makePairs" prds) == (getGroupPeriod $ fst pair))
105 || ((last' "makePairs" prds) == (getGroupPeriod $ snd pair)))
106 $ listToPairs
107 $ filter (\g' -> (elem (getGroupPeriod g') prds)
108 && ((not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
109 && (((last' "makePairs" prds) == (getGroupPeriod g))
110 ||((matchWithPairs g (g,g') p) >= (getPhyloMatchingFrameTh p))))
111 $ getGroupsWithLevel (getGroupLevel g) p
112
113 matchWithPairs :: PhyloGroup -> (PhyloGroup,PhyloGroup) -> Phylo -> Double
114 matchWithPairs g1 (g2,g3) p =
115 let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] p
116 cooc = if (g2 == g3)
117 then getGroupCooc g2
118 else unionWith (+) (getGroupCooc g2) (getGroupCooc g3)
119 ngrams = if (g2 == g3)
120 then getGroupNgrams g2
121 else union (getGroupNgrams g2) (getGroupNgrams g3)
122 in processProximity (getPhyloProximity p) nbDocs (getGroupCooc g1) cooc (getGroupNgrams g1) ngrams
123
124
125 phyloGroupMatching :: [PhyloPeriodId] -> PhyloGroup -> Phylo -> [Pointer]
126 phyloGroupMatching periods g p = case pointers of
127 Nothing -> []
128 Just pts -> head' "phyloGroupMatching"
129 -- | Keep only the best set of pointers grouped by proximity
130 $ groupBy (\pt pt' -> snd pt == snd pt')
131 $ reverse $ sortOn snd pts
132 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
133 where
134 --------------------------------------
135 pointers :: Maybe [Pointer]
136 pointers = find (not . null)
137 -- | For each time frame, process the Proximity on relevant pairs of targeted groups
138 $ scanl (\acc frame ->
139 let pairs = makePairs frame g p
140 in acc ++ ( filter (\(_,proxi) -> filterProximity proxi (getPhyloProximity p))
141 $ concat
142 $ map (\(t,t') ->
143 let proxi = matchWithPairs g (t,t') p
144 in
145 if (t == t')
146 then [(getGroupId t,proxi)]
147 else [(getGroupId t,proxi),(getGroupId t',proxi)] ) pairs ) ) []
148 -- | [[1900],[1900,1901],[1900,1901,1902],...] | length max => + 5 years
149 $ inits periods
150 --------------------------------------
151
152
153 -- | To add some Pointer to a PhyloGroup
154 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
155 addPointers' fil pts g = g & case fil of
156 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
157 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
158 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
159
160
161
162 -- | To update a list of phyloGroups with some Pointers
163 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
164 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if ((getGroupLevel g) == lvl) && (member (getGroupId g) m)
165 then addPointers' fil (m ! (getGroupId g)) g
166 else g ) gs) p
167
168
169
170 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
171 initCandidates :: PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [PhyloGroup]
172 initCandidates g prds gs = filter (\g' -> elem (getGroupPeriod g') prds)
173 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
174 $ delete g gs
175
176
177 -- | a init avec la [[head groups]] et la tail groups
178 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
179 toBranches mem gs
180 | null gs = mem
181 | otherwise = toBranches mem' $ tail gs
182 where
183 --------------------------------------
184 mem' :: [[PhyloGroup]]
185 mem' = if (null withHead)
186 then mem ++ [[head' "toBranches" gs]]
187 else (filter (\gs' -> not $ elem gs' withHead) mem)
188 ++
189 [(concat withHead) ++ [head' "toBranches" gs]]
190 --------------------------------------
191 withHead :: [[PhyloGroup]]
192 withHead = filter (\gs' -> (not . null)
193 $ intersect (concat $ map getGroupNgrams gs')
194 (getGroupNgrams $ (head' "toBranches" gs))
195 ) mem
196 --------------------------------------
197
198
199 -- | To process an intertemporal matching task to a Phylo at a given level
200 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
201 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
202 -- | 3) update all the groups with the new pointers if they exist
203 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
204 interTempoMatching fil lvl _ p = updateGroups fil lvl (Map.fromList pointers) p
205 where
206 --------------------------------------
207 pointers :: [(PhyloGroupId,[Pointer])]
208 pointers =
209 let pts = map (\g -> let periods = getNextPeriods fil (getPhyloMatchingFrame p) (getGroupPeriod g) (getPhyloPeriods p)
210 in (getGroupId g, phyloGroupMatching periods g p)) groups
211 pts' = pts `using` parList rdeepseq
212 in pts'
213 --------------------------------------
214 groups :: [PhyloGroup]
215 groups = getGroupsWithLevel lvl p
216 --------------------------------------
217
218
219 ------------------------------------------------------------------------
220 -- | Make links from Period to Period after level 1
221
222
223 listToTuple :: (a -> b) -> [a] -> [(b,a)]
224 listToTuple f l = map (\x -> (f x, x)) l
225
226
227 groupsToMaps :: Ord b => (PhyloGroup -> b) -> [PhyloGroup] -> [Map PhyloGroupId PhyloGroup]
228 groupsToMaps f gs = map (\gs' -> fromList $ listToTuple getGroupId gs')
229 $ groupBy ((==) `on` f)
230 $ sortOn f gs
231
232
233 phyloToPeriodMaps :: Level -> Filiation -> Phylo -> [Map PhyloGroupId PhyloGroup]
234 phyloToPeriodMaps lvl fil p =
235 let prdMap = groupsToMaps (fst . getGroupPeriod) (getGroupsWithLevel lvl p)
236 in case fil of
237 Ascendant -> reverse prdMap
238 Descendant -> prdMap
239 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.phyloToPeriodMaps] Wrong type of filiation")
240
241
242 trackPointersRec :: Filiation -> Map PhyloGroupId PhyloGroup -> [PhyloGroup] -> [PhyloGroup] -> [PhyloGroup]
243 trackPointersRec fil m gs res =
244 if (null gs) then res
245 else if (Map.null m) then res ++ gs
246 else
247 let g = head' "track" gs
248 pts = Map.fromList $ getGroupPointers PeriodEdge fil g
249 pts' = Map.toList $ fromListWith (\w w' -> max w w') $ concat $ elems
250 $ intersectionWith (\w g' -> map (\(id,_w') -> (id, w))
251 $ getGroupPointers LevelEdge Ascendant g') pts m
252 res' = res ++ [case fil of
253 Ascendant -> g & phylo_groupPeriodParents .~ pts'
254 Descendant -> g & phylo_groupPeriodChilds .~ pts'
255 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.transposeLinks] Wrong type of filiation")]
256 in trackPointersRec fil (filterWithKey (\k _ -> not $ elem k (keys pts)) m) (tail' "track" gs) res'
257
258
259
260 transposeLinks :: Level -> Filiation -> Phylo -> Phylo
261 transposeLinks lvl fil p =
262 let prdMap = zip (phyloToPeriodMaps (lvl - 1) fil p) (phyloToPeriodMaps lvl fil p)
263 transposed = map (\(gs,gs') ->
264 let idx = fromJust $ elemIndex (gs,gs') prdMap
265 next = take (getPhyloMatchingFrame p) $ snd $ splitAt (idx + 1) prdMap
266 groups = trackPointersRec fil (unions $ map fst next) (elems gs') []
267 in (getGroupPeriod $ head' "transpose" groups ,groups)
268 ) prdMap
269 transposed' = Map.fromList $ (transposed `using` parList rdeepseq)
270 in alterPhyloGroups
271 (\gs -> if ((not . null) gs) && (lvl == (getGroupLevel $ head' "transpose" gs))
272 then transposed' ! (getGroupPeriod $ head' "transpose" gs)
273 else gs
274 ) p
275
276
277
278 -- | Transpose the parent/child pointers from one level to another
279 transposePeriodLinks :: Level -> Phylo -> Phylo
280 transposePeriodLinks lvl p = alterPhyloGroups
281 (\gs -> if ((not . null) gs) && (elem lvl $ map getGroupLevel gs)
282 then
283 let groups = map (\g -> let m = reduceGroups g lvlGroups
284 in g & phylo_groupPeriodParents .~ (trackPointers m $ g ^. phylo_groupPeriodParents)
285 & phylo_groupPeriodChilds .~ (trackPointers m $ g ^. phylo_groupPeriodChilds )) gs
286 groups' = groups `using` parList rdeepseq
287 in groups'
288 else gs
289 ) p
290 where
291 --------------------------------------
292 -- | find an other way to find the group from the id
293 trackPointers :: Map PhyloGroupId PhyloGroup -> [Pointer] -> [Pointer]
294 trackPointers m pts = Map.toList
295 $ fromListWith (\w w' -> max w w')
296 $ map (\(id,_w) -> (getGroupLevelParentId $ m ! id,_w)) pts
297 --------------------------------------
298 reduceGroups :: PhyloGroup -> [PhyloGroup] -> Map PhyloGroupId PhyloGroup
299 reduceGroups g gs = Map.fromList
300 $ map (\g' -> (getGroupId g',g'))
301 $ filter (\g' -> ((not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))) gs
302 --------------------------------------
303 lvlGroups :: [PhyloGroup]
304 lvlGroups = getGroupsWithLevel (lvl - 1) p
305 --------------------------------------
306
307
308 ----------------
309 -- | Tracer | --
310 ----------------
311
312 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
313 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
314 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
315 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
316 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
317 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
318 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
319
320
321 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
322 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
323 <> "with sizes : " <> show (map length bs) <> "\n") bs
324