]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/LinkMaker.hs
Merge branch 'dev' into dev-doc-table-optimization
[gargantext.git] / src / Gargantext / Viz / Phylo / LinkMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13
14 module Gargantext.Viz.Phylo.LinkMaker
15 where
16
17 import Control.Parallel.Strategies
18 import Control.Lens hiding (both, Level)
19 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, delete, intersect, elemIndex, groupBy, union, inits, scanl, find)
20 import Data.Tuple.Extra
21 import Data.Map (Map, (!), fromListWith, elems, restrictKeys, filterWithKey, keys, unionWith, unions, intersectionWith, member, fromList)
22 import Gargantext.Prelude
23 import Gargantext.Viz.Phylo
24 import Gargantext.Viz.Phylo.Tools
25 import Gargantext.Viz.Phylo.Metrics
26 import qualified Data.List as List
27 import qualified Data.Maybe as Maybe
28 import qualified Data.Map as Map
29
30 import qualified Data.Vector.Storable as VS
31 import Debug.Trace (trace)
32 import Numeric.Statistics (percentile)
33
34 -----------------------------
35 -- | From Level to level | --
36 -----------------------------
37
38
39 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
40 linkGroupToGroups :: PhyloGroup -> [PhyloGroup] -> PhyloGroup
41 linkGroupToGroups current targets = over (phylo_groupLevelParents) addPointers current
42 where
43 --------------------------------------
44 addPointers :: [Pointer] -> [Pointer]
45 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
46 if (elem (getGroupId current) (getGroupLevelChildsId target))
47 then Just ((getGroupId target),1)
48 else Nothing) targets
49 --------------------------------------
50
51
52 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
53 setLevelLinks (lvl,lvl') p = alterGroupWithLevel (\group -> linkGroupToGroups group
54 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams group) (getGroupNgrams g'))
55 $ getGroupsWithFilters lvl' (getGroupPeriod group) p) lvl p
56
57
58 -------------------------------
59 -- | From Period to Period | --
60 -------------------------------
61
62
63 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
64 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
65 getNextPeriods to' limit id l = case to' of
66 Descendant -> take limit $ (tail . snd) next
67 Ascendant -> take limit $ (reverse . fst) next
68 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
69 where
70 --------------------------------------
71 next :: ([PhyloPeriodId], [PhyloPeriodId])
72 next = splitAt idx l
73 --------------------------------------
74 idx :: Int
75 idx = case (List.elemIndex id l) of
76 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
77 Just i -> i
78 --------------------------------------
79
80
81 -- | To get the number of docs produced during a list of periods
82 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
83 periodsToNbDocs prds phylo = sum $ elems
84 $ restrictKeys (phylo ^. phylo_docsByYears)
85 $ periodsToYears prds
86
87
88 -- | To process a given Proximity
89 processProximity :: Proximity -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
90 processProximity proximity nbDocs cooc cooc' ngrams ngrams' = case proximity of
91 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
92 Hamming (HammingParams _) -> hamming cooc cooc'
93 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
94
95
96 filterProximity :: Double -> Proximity -> Bool
97 filterProximity score prox = case prox of
98 WeightedLogJaccard (WLJParams thr _) -> score >= thr
99 Hamming (HammingParams thr) -> score <= thr
100 _ -> panic "[ERR][Viz.Phylo.LinkMaker.filterProximity] Unknown proximity"
101
102
103 makePairs :: [(Date,Date)] -> PhyloGroup -> Phylo -> [(PhyloGroup,PhyloGroup)]
104 makePairs prds g p = filter (\pair -> ((last' "makePairs" prds) == (getGroupPeriod $ fst pair))
105 || ((last' "makePairs" prds) == (getGroupPeriod $ snd pair)))
106 $ listToPairs
107 $ filter (\g' -> (elem (getGroupPeriod g') prds)
108 && ((not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
109 && (((last' "makePairs" prds) == (getGroupPeriod g))
110 ||((matchWithPairs g (g,g') p) >= (getPhyloMatchingFrameTh p))))
111 $ getGroupsWithLevel (getGroupLevel g) p
112
113 matchWithPairs :: PhyloGroup -> (PhyloGroup,PhyloGroup) -> Phylo -> Double
114 matchWithPairs g1 (g2,g3) p =
115 let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] p
116 cooc = if (g2 == g3)
117 then getGroupCooc g2
118 else unionWith (+) (getGroupCooc g2) (getGroupCooc g3)
119 ngrams = if (g2 == g3)
120 then getGroupNgrams g2
121 else union (getGroupNgrams g2) (getGroupNgrams g3)
122 in processProximity (getPhyloProximity p) nbDocs (getGroupCooc g1) cooc (getGroupNgrams g1) ngrams
123
124
125 phyloGroupMatching :: [PhyloPeriodId] -> PhyloGroup -> Phylo -> [Pointer]
126 phyloGroupMatching periods g p = case pointers of
127 Nothing -> []
128 Just pts -> head' "phyloGroupMatching"
129 -- Keep only the best set of pointers grouped by proximity
130 $ groupBy (\pt pt' -> snd pt == snd pt')
131 $ reverse $ sortOn snd pts
132 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
133 where
134 --------------------------------------
135 pointers :: Maybe [Pointer]
136 pointers = find (not . null)
137 -- For each time frame, process the Proximity on relevant pairs of targeted groups
138 $ scanl (\acc frame ->
139 let pairs = makePairs frame g p
140 in acc ++ ( filter (\(_,proxi) -> filterProximity proxi (getPhyloProximity p))
141 $ concat
142 $ map (\(t,t') ->
143 let proxi = matchWithPairs g (t,t') p
144 in
145 if (t == t')
146 then [(getGroupId t,proxi)]
147 else [(getGroupId t,proxi),(getGroupId t',proxi)] ) pairs ) ) []
148 -- [[1900],[1900,1901],[1900,1901,1902],...] | length max => + 5 years
149 $ inits periods
150 --------------------------------------
151
152
153 -- | To add some Pointer to a PhyloGroup
154 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
155 addPointers' fil pts g = g & case fil of
156 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
157 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
158 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
159
160
161
162 -- | To update a list of phyloGroups with some Pointers
163 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
164 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if ((getGroupLevel g) == lvl) && (member (getGroupId g) m)
165 then addPointers' fil (m ! (getGroupId g)) g
166 else g ) gs) p
167
168
169
170 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
171 initCandidates :: PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [PhyloGroup]
172 initCandidates g prds gs = filter (\g' -> elem (getGroupPeriod g') prds)
173 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
174 $ delete g gs
175
176
177 -- | a init avec la [[head groups]] et la tail groups
178 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
179 toBranches mem gs
180 | null gs = mem
181 | otherwise = toBranches mem' $ tail gs
182 where
183 --------------------------------------
184 mem' :: [[PhyloGroup]]
185 mem' = if (null withHead)
186 then mem ++ [[head' "toBranches" gs]]
187 else (filter (\gs' -> not $ elem gs' withHead) mem)
188 ++
189 [(concat withHead) ++ [head' "toBranches" gs]]
190 --------------------------------------
191 withHead :: [[PhyloGroup]]
192 withHead = filter (\gs' -> (not . null)
193 $ intersect (concat $ map getGroupNgrams gs')
194 (getGroupNgrams $ (head' "toBranches" gs))
195 ) mem
196 --------------------------------------
197
198
199 -- | To process an intertemporal matching task to a Phylo at a given level
200 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
201 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
202 -- | 3) update all the groups with the new pointers if they exist
203 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
204 interTempoMatching fil lvl _ p = updateGroups fil lvl (Map.fromList pointers) p
205 where
206 --------------------------------------
207 pointers :: [(PhyloGroupId,[Pointer])]
208 pointers =
209 let pts = map (\g -> let periods = getNextPeriods fil (getPhyloMatchingFrame p) (getGroupPeriod g) (getPhyloPeriods p)
210 in (getGroupId g, phyloGroupMatching periods g p)) groups
211 pts' = pts `using` parList rdeepseq
212 in pts'
213 --------------------------------------
214 groups :: [PhyloGroup]
215 groups = getGroupsWithLevel lvl p
216 --------------------------------------
217
218
219 ------------------------------------------------------------------------
220 -- | Make links from Period to Period after level 1
221 listToTuple :: (a -> b) -> [a] -> [(b,a)]
222 listToTuple f l = map (\x -> (f x, x)) l
223
224
225 groupsToMaps :: Ord b => (PhyloGroup -> b) -> [PhyloGroup] -> [Map PhyloGroupId PhyloGroup]
226 groupsToMaps f gs = map (\gs' -> fromList $ listToTuple getGroupId gs')
227 $ groupBy ((==) `on` f)
228 $ sortOn f gs
229
230
231 phyloToPeriodMaps :: Level -> Filiation -> Phylo -> [Map PhyloGroupId PhyloGroup]
232 phyloToPeriodMaps lvl fil p =
233 let prdMap = groupsToMaps (fst . getGroupPeriod) (getGroupsWithLevel lvl p)
234 in case fil of
235 Ascendant -> reverse prdMap
236 Descendant -> prdMap
237 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.phyloToPeriodMaps] Wrong type of filiation")
238
239
240 trackPointersRec :: Filiation -> Map PhyloGroupId PhyloGroup -> [PhyloGroup] -> [PhyloGroup] -> [PhyloGroup]
241 trackPointersRec fil m gs res =
242 if (null gs) then res
243 else if (Map.null m) then res ++ gs
244 else
245 let g = head' "track" gs
246 pts = Map.fromList $ getGroupPointers PeriodEdge fil g
247 pts' = Map.toList $ fromListWith (\w w' -> max w w') $ concat $ elems
248 $ intersectionWith (\w g' -> map (\(id,_w') -> (id, w))
249 $ getGroupPointers LevelEdge Ascendant g') pts m
250 res' = res ++ [case fil of
251 Ascendant -> g & phylo_groupPeriodParents .~ pts'
252 Descendant -> g & phylo_groupPeriodChilds .~ pts'
253 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.transposeLinks] Wrong type of filiation")]
254 in trackPointersRec fil (filterWithKey (\k _ -> not $ elem k (keys pts)) m) (tail' "track" gs) res'
255
256
257
258 transposeLinks :: Level -> Filiation -> Phylo -> Phylo
259 transposeLinks lvl fil p =
260 let prdMap = zip (phyloToPeriodMaps (lvl - 1) fil p) (phyloToPeriodMaps lvl fil p)
261 transposed = map (\(gs,gs') ->
262 let idx = fromJust $ elemIndex (gs,gs') prdMap
263 next = take (getPhyloMatchingFrame p) $ snd $ splitAt (idx + 1) prdMap
264 groups = trackPointersRec fil (unions $ map fst next) (elems gs') []
265 in (getGroupPeriod $ head' "transpose" groups ,groups)
266 ) prdMap
267 transposed' = Map.fromList $ (transposed `using` parList rdeepseq)
268 in alterPhyloGroups
269 (\gs -> if ((not . null) gs) && (lvl == (getGroupLevel $ head' "transpose" gs))
270 then transposed' ! (getGroupPeriod $ head' "transpose" gs)
271 else gs
272 ) p
273
274
275
276 -- | Transpose the parent/child pointers from one level to another
277 transposePeriodLinks :: Level -> Phylo -> Phylo
278 transposePeriodLinks lvl p = alterPhyloGroups
279 (\gs -> if ((not . null) gs) && (elem lvl $ map getGroupLevel gs)
280 then
281 let groups = map (\g -> let m = reduceGroups g lvlGroups
282 in g & phylo_groupPeriodParents .~ (trackPointers m $ g ^. phylo_groupPeriodParents)
283 & phylo_groupPeriodChilds .~ (trackPointers m $ g ^. phylo_groupPeriodChilds )) gs
284 groups' = groups `using` parList rdeepseq
285 in groups'
286 else gs
287 ) p
288 where
289 --------------------------------------
290 -- | find an other way to find the group from the id
291 trackPointers :: Map PhyloGroupId PhyloGroup -> [Pointer] -> [Pointer]
292 trackPointers m pts = Map.toList
293 $ fromListWith (\w w' -> max w w')
294 $ map (\(id,_w) -> (getGroupLevelParentId $ m ! id,_w)) pts
295 --------------------------------------
296 reduceGroups :: PhyloGroup -> [PhyloGroup] -> Map PhyloGroupId PhyloGroup
297 reduceGroups g gs = Map.fromList
298 $ map (\g' -> (getGroupId g',g'))
299 $ filter (\g' -> ((not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))) gs
300 --------------------------------------
301 lvlGroups :: [PhyloGroup]
302 lvlGroups = getGroupsWithLevel (lvl - 1) p
303 --------------------------------------
304
305
306 ----------------
307 -- | Tracer | --
308 ----------------
309
310 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
311 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
312 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
313 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
314 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
315 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
316 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
317
318
319 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
320 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
321 <> "with sizes : " <> show (map length bs) <> "\n") bs
322