]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/LinkMaker.hs
Merge branch 'dev' into dev-phylo
[gargantext.git] / src / Gargantext / Viz / Phylo / LinkMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Viz.Phylo.LinkMaker
18 where
19
20 import Control.Parallel.Strategies
21 import Control.Lens hiding (both, Level)
22 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, delete, intersect, elemIndex, groupBy, union, inits, scanl, find)
23 import Data.Tuple.Extra
24 import Data.Map (Map, (!), fromListWith, elems, restrictKeys, filterWithKey, keys, unionWith, unions, intersectionWith, member, fromList)
25 import Gargantext.Prelude
26 import Gargantext.Viz.Phylo
27 import Gargantext.Viz.Phylo.Tools
28 import Gargantext.Viz.Phylo.Metrics
29 import qualified Data.List as List
30 import qualified Data.Maybe as Maybe
31 import qualified Data.Map as Map
32
33 import qualified Data.Vector.Storable as VS
34 import Debug.Trace (trace)
35 import Numeric.Statistics (percentile)
36
37 -----------------------------
38 -- | From Level to level | --
39 -----------------------------
40
41
42 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
43 linkGroupToGroups :: PhyloGroup -> [PhyloGroup] -> PhyloGroup
44 linkGroupToGroups current targets = over (phylo_groupLevelParents) addPointers current
45 where
46 --------------------------------------
47 addPointers :: [Pointer] -> [Pointer]
48 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
49 if (elem (getGroupId current) (getGroupLevelChildsId target))
50 then Just ((getGroupId target),1)
51 else Nothing) targets
52 --------------------------------------
53
54
55 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
56 setLevelLinks (lvl,lvl') p = alterGroupWithLevel (\group -> linkGroupToGroups group
57 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams group) (getGroupNgrams g'))
58 $ getGroupsWithFilters lvl' (getGroupPeriod group) p) lvl p
59
60
61 -------------------------------
62 -- | From Period to Period | --
63 -------------------------------
64
65
66 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
67 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
68 getNextPeriods to' limit id l = case to' of
69 Descendant -> take limit $ (tail . snd) next
70 Ascendant -> take limit $ (reverse . fst) next
71 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
72 where
73 --------------------------------------
74 next :: ([PhyloPeriodId], [PhyloPeriodId])
75 next = splitAt idx l
76 --------------------------------------
77 idx :: Int
78 idx = case (List.elemIndex id l) of
79 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
80 Just i -> i
81 --------------------------------------
82
83
84 -- | To get the number of docs produced during a list of periods
85 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
86 periodsToNbDocs prds phylo = sum $ elems
87 $ restrictKeys (phylo ^. phylo_docsByYears)
88 $ periodsToYears prds
89
90
91 -- | To process a given Proximity
92 processProximity :: Proximity -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
93 processProximity proximity nbDocs cooc cooc' ngrams ngrams' = case proximity of
94 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
95 Hamming (HammingParams _) -> hamming cooc cooc'
96 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
97
98
99 filterProximity :: Double -> Proximity -> Bool
100 filterProximity score prox = case prox of
101 WeightedLogJaccard (WLJParams thr _) -> score >= thr
102 Hamming (HammingParams thr) -> score <= thr
103 _ -> panic "[ERR][Viz.Phylo.LinkMaker.filterProximity] Unknown proximity"
104
105
106 makePairs :: [(Date,Date)] -> PhyloGroup -> Phylo -> [(PhyloGroup,PhyloGroup)]
107 makePairs prds g p = filter (\pair -> ((last' "makePairs" prds) == (getGroupPeriod $ fst pair))
108 || ((last' "makePairs" prds) == (getGroupPeriod $ snd pair)))
109 $ listToPairs
110 $ filter (\g' -> (elem (getGroupPeriod g') prds)
111 && ((not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
112 && (((last' "makePairs" prds) == (getGroupPeriod g))
113 ||((matchWithPairs g (g,g') p) >= (getPhyloMatchingFrameTh p))))
114 $ getGroupsWithLevel (getGroupLevel g) p
115
116 matchWithPairs :: PhyloGroup -> (PhyloGroup,PhyloGroup) -> Phylo -> Double
117 matchWithPairs g1 (g2,g3) p =
118 let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] p
119 cooc = if (g2 == g3)
120 then getGroupCooc g2
121 else unionWith (+) (getGroupCooc g2) (getGroupCooc g3)
122 ngrams = if (g2 == g3)
123 then getGroupNgrams g2
124 else union (getGroupNgrams g2) (getGroupNgrams g3)
125 in processProximity (getPhyloProximity p) nbDocs (getGroupCooc g1) cooc (getGroupNgrams g1) ngrams
126
127
128 phyloGroupMatching :: [PhyloPeriodId] -> PhyloGroup -> Phylo -> [Pointer]
129 phyloGroupMatching periods g p = case pointers of
130 Nothing -> []
131 Just pts -> head' "phyloGroupMatching"
132 -- | Keep only the best set of pointers grouped by proximity
133 $ groupBy (\pt pt' -> snd pt == snd pt')
134 $ reverse $ sortOn snd pts
135 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
136 where
137 --------------------------------------
138 pointers :: Maybe [Pointer]
139 pointers = find (not . null)
140 -- | For each time frame, process the Proximity on relevant pairs of targeted groups
141 $ scanl (\acc frame ->
142 let pairs = makePairs frame g p
143 in acc ++ ( filter (\(_,proxi) -> filterProximity proxi (getPhyloProximity p))
144 $ concat
145 $ map (\(t,t') ->
146 let proxi = matchWithPairs g (t,t') p
147 in
148 if (t == t')
149 then [(getGroupId t,proxi)]
150 else [(getGroupId t,proxi),(getGroupId t',proxi)] ) pairs ) ) []
151 -- | [[1900],[1900,1901],[1900,1901,1902],...] | length max => + 5 years
152 $ inits periods
153 --------------------------------------
154
155
156 -- | To add some Pointer to a PhyloGroup
157 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
158 addPointers' fil pts g = g & case fil of
159 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
160 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
161 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
162
163
164
165 -- | To update a list of phyloGroups with some Pointers
166 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
167 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if ((getGroupLevel g) == lvl) && (member (getGroupId g) m)
168 then addPointers' fil (m ! (getGroupId g)) g
169 else g ) gs) p
170
171
172
173 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
174 initCandidates :: PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [PhyloGroup]
175 initCandidates g prds gs = filter (\g' -> elem (getGroupPeriod g') prds)
176 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
177 $ delete g gs
178
179
180 -- | a init avec la [[head groups]] et la tail groups
181 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
182 toBranches mem gs
183 | null gs = mem
184 | otherwise = toBranches mem' $ tail gs
185 where
186 --------------------------------------
187 mem' :: [[PhyloGroup]]
188 mem' = if (null withHead)
189 then mem ++ [[head' "toBranches" gs]]
190 else (filter (\gs' -> not $ elem gs' withHead) mem)
191 ++
192 [(concat withHead) ++ [head' "toBranches" gs]]
193 --------------------------------------
194 withHead :: [[PhyloGroup]]
195 withHead = filter (\gs' -> (not . null)
196 $ intersect (concat $ map getGroupNgrams gs')
197 (getGroupNgrams $ (head' "toBranches" gs))
198 ) mem
199 --------------------------------------
200
201
202 -- | To process an intertemporal matching task to a Phylo at a given level
203 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
204 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
205 -- | 3) update all the groups with the new pointers if they exist
206 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
207 interTempoMatching fil lvl _ p = updateGroups fil lvl (Map.fromList pointers) p
208 where
209 --------------------------------------
210 pointers :: [(PhyloGroupId,[Pointer])]
211 pointers =
212 let pts = map (\g -> let periods = getNextPeriods fil (getPhyloMatchingFrame p) (getGroupPeriod g) (getPhyloPeriods p)
213 in (getGroupId g, phyloGroupMatching periods g p)) groups
214 pts' = pts `using` parList rdeepseq
215 in pts'
216 --------------------------------------
217 groups :: [PhyloGroup]
218 groups = getGroupsWithLevel lvl p
219 --------------------------------------
220
221
222 ------------------------------------------------------------------------
223 -- | Make links from Period to Period after level 1
224
225
226 listToTuple :: (a -> b) -> [a] -> [(b,a)]
227 listToTuple f l = map (\x -> (f x, x)) l
228
229
230 groupsToMaps :: Ord b => (PhyloGroup -> b) -> [PhyloGroup] -> [Map PhyloGroupId PhyloGroup]
231 groupsToMaps f gs = map (\gs' -> fromList $ listToTuple getGroupId gs')
232 $ groupBy ((==) `on` f)
233 $ sortOn f gs
234
235
236 phyloToPeriodMaps :: Level -> Filiation -> Phylo -> [Map PhyloGroupId PhyloGroup]
237 phyloToPeriodMaps lvl fil p =
238 let prdMap = groupsToMaps (fst . getGroupPeriod) (getGroupsWithLevel lvl p)
239 in case fil of
240 Ascendant -> reverse prdMap
241 Descendant -> prdMap
242 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.phyloToPeriodMaps] Wrong type of filiation")
243
244
245 trackPointersRec :: Filiation -> Map PhyloGroupId PhyloGroup -> [PhyloGroup] -> [PhyloGroup] -> [PhyloGroup]
246 trackPointersRec fil m gs res =
247 if (null gs) then res
248 else if (Map.null m) then res ++ gs
249 else
250 let g = head' "track" gs
251 pts = Map.fromList $ getGroupPointers PeriodEdge fil g
252 pts' = Map.toList $ fromListWith (\w w' -> max w w') $ concat $ elems
253 $ intersectionWith (\w g' -> map (\(id,_w') -> (id, w))
254 $ getGroupPointers LevelEdge Ascendant g') pts m
255 res' = res ++ [case fil of
256 Ascendant -> g & phylo_groupPeriodParents .~ pts'
257 Descendant -> g & phylo_groupPeriodChilds .~ pts'
258 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.transposeLinks] Wrong type of filiation")]
259 in trackPointersRec fil (filterWithKey (\k _ -> not $ elem k (keys pts)) m) (tail' "track" gs) res'
260
261
262
263 transposeLinks :: Level -> Filiation -> Phylo -> Phylo
264 transposeLinks lvl fil p =
265 let prdMap = zip (phyloToPeriodMaps (lvl - 1) fil p) (phyloToPeriodMaps lvl fil p)
266 transposed = map (\(gs,gs') ->
267 let idx = fromJust $ elemIndex (gs,gs') prdMap
268 next = take (getPhyloMatchingFrame p) $ snd $ splitAt (idx + 1) prdMap
269 groups = trackPointersRec fil (unions $ map fst next) (elems gs') []
270 in (getGroupPeriod $ head' "transpose" groups ,groups)
271 ) prdMap
272 transposed' = Map.fromList $ (transposed `using` parList rdeepseq)
273 in alterPhyloGroups
274 (\gs -> if ((not . null) gs) && (lvl == (getGroupLevel $ head' "transpose" gs))
275 then transposed' ! (getGroupPeriod $ head' "transpose" gs)
276 else gs
277 ) p
278
279
280
281 -- | Transpose the parent/child pointers from one level to another
282 transposePeriodLinks :: Level -> Phylo -> Phylo
283 transposePeriodLinks lvl p = alterPhyloGroups
284 (\gs -> if ((not . null) gs) && (elem lvl $ map getGroupLevel gs)
285 then
286 let groups = map (\g -> let m = reduceGroups g lvlGroups
287 in g & phylo_groupPeriodParents .~ (trackPointers m $ g ^. phylo_groupPeriodParents)
288 & phylo_groupPeriodChilds .~ (trackPointers m $ g ^. phylo_groupPeriodChilds )) gs
289 groups' = groups `using` parList rdeepseq
290 in groups'
291 else gs
292 ) p
293 where
294 --------------------------------------
295 -- | find an other way to find the group from the id
296 trackPointers :: Map PhyloGroupId PhyloGroup -> [Pointer] -> [Pointer]
297 trackPointers m pts = Map.toList
298 $ fromListWith (\w w' -> max w w')
299 $ map (\(id,_w) -> (getGroupLevelParentId $ m ! id,_w)) pts
300 --------------------------------------
301 reduceGroups :: PhyloGroup -> [PhyloGroup] -> Map PhyloGroupId PhyloGroup
302 reduceGroups g gs = Map.fromList
303 $ map (\g' -> (getGroupId g',g'))
304 $ filter (\g' -> ((not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))) gs
305 --------------------------------------
306 lvlGroups :: [PhyloGroup]
307 lvlGroups = getGroupsWithLevel (lvl - 1) p
308 --------------------------------------
309
310
311 ----------------
312 -- | Tracer | --
313 ----------------
314
315 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
316 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
317 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
318 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
319 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
320 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
321 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
322
323
324 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
325 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
326 <> "with sizes : " <> show (map length bs) <> "\n") bs
327