]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/LinkMaker.hs
RENAME FIX
[gargantext.git] / src / Gargantext / Viz / Phylo / LinkMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Viz.Phylo.LinkMaker
18 where
19
20 import Control.Parallel.Strategies
21 import Control.Lens hiding (both, Level)
22 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, delete, intersect, groupBy, union, inits, scanl, find)
23 import Data.Tuple.Extra
24 import Data.Map (Map,(!),fromListWith,elems,restrictKeys,unionWith,member)
25 import Gargantext.Prelude
26 import Gargantext.Viz.Phylo
27 import Gargantext.Viz.Phylo.Tools
28 import Gargantext.Viz.Phylo.Metrics
29 import qualified Data.List as List
30 import qualified Data.Maybe as Maybe
31 import qualified Data.Map as Map
32
33 import qualified Data.Vector.Storable as VS
34 import Debug.Trace (trace)
35 import Numeric.Statistics (percentile)
36
37 -----------------------------
38 -- | From Level to level | --
39 -----------------------------
40
41
42 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
43 linkGroupToGroups :: PhyloGroup -> [PhyloGroup] -> PhyloGroup
44 linkGroupToGroups current targets = over (phylo_groupLevelParents) addPointers current
45 where
46 --------------------------------------
47 addPointers :: [Pointer] -> [Pointer]
48 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
49 if (elem (getGroupId current) (getGroupLevelChildsId target))
50 then Just ((getGroupId target),1)
51 else Nothing) targets
52 --------------------------------------
53
54
55 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
56 setLevelLinks (lvl,lvl') p = alterGroupWithLevel (\group -> linkGroupToGroups group
57 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams group) (getGroupNgrams g'))
58 $ getGroupsWithFilters lvl' (getGroupPeriod group) p) lvl p
59
60
61 -------------------------------
62 -- | From Period to Period | --
63 -------------------------------
64
65
66 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
67 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
68 getNextPeriods to' limit id l = case to' of
69 Descendant -> take limit $ (tail . snd) next
70 Ascendant -> take limit $ (reverse . fst) next
71 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
72 where
73 --------------------------------------
74 next :: ([PhyloPeriodId], [PhyloPeriodId])
75 next = splitAt idx l
76 --------------------------------------
77 idx :: Int
78 idx = case (List.elemIndex id l) of
79 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
80 Just i -> i
81 --------------------------------------
82
83
84 -- | To get the number of docs produced during a list of periods
85 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
86 periodsToNbDocs prds phylo = sum $ elems
87 $ restrictKeys (phylo ^. phylo_docsByYears)
88 $ periodsToYears prds
89
90
91 -- | To process a given Proximity
92 processProximity :: Proximity -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
93 processProximity proximity nbDocs cooc cooc' ngrams ngrams' = case proximity of
94 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
95 Hamming (HammingParams _) -> hamming cooc cooc'
96 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
97
98
99 filterProximity :: Double -> Proximity -> Bool
100 filterProximity score prox = case prox of
101 WeightedLogJaccard (WLJParams thr _) -> score >= thr
102 Hamming (HammingParams thr) -> score <= thr
103 _ -> panic "[ERR][Viz.Phylo.LinkMaker.filterProximity] Unknown proximity"
104
105
106 makePairs :: [(Date,Date)] -> PhyloGroup -> Phylo -> [(PhyloGroup,PhyloGroup)]
107 makePairs prds g p = filter (\pair -> ((last' "makePairs" prds) == (getGroupPeriod $ fst pair))
108 || ((last' "makePairs" prds) == (getGroupPeriod $ snd pair)))
109 $ listToPairs
110 $ filter (\g' -> (elem (getGroupPeriod g') prds)
111 && ((not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
112 && (((last' "makePairs" prds) == (getGroupPeriod g))
113 ||((matchWithPairs g (g,g') p) >= (getPhyloMatchingFrameTh p))))
114 $ getGroupsWithLevel (getGroupLevel g) p
115
116 matchWithPairs :: PhyloGroup -> (PhyloGroup,PhyloGroup) -> Phylo -> Double
117 matchWithPairs g1 (g2,g3) p =
118 let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] p
119 cooc = if (g2 == g3)
120 then getGroupCooc g2
121 else unionWith (+) (getGroupCooc g2) (getGroupCooc g3)
122 ngrams = if (g2 == g3)
123 then getGroupNgrams g2
124 else union (getGroupNgrams g2) (getGroupNgrams g3)
125 in processProximity (getPhyloProximity p) nbDocs (getGroupCooc g1) cooc (getGroupNgrams g1) ngrams
126
127
128 phyloGroupMatching :: [PhyloPeriodId] -> PhyloGroup -> Phylo -> [Pointer]
129 phyloGroupMatching periods g p = case pointers of
130 Nothing -> []
131 Just pts -> head' "phyloGroupMatching"
132 -- | Keep only the best set of pointers grouped by proximity
133 $ groupBy (\pt pt' -> snd pt == snd pt')
134 $ reverse $ sortOn snd pts
135 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
136 where
137 --------------------------------------
138 pointers :: Maybe [Pointer]
139 pointers = find (not . null)
140 -- | For each time frame, process the Proximity on relevant pairs of targeted groups
141 $ scanl (\acc frame ->
142 let pairs = makePairs frame g p
143 in acc ++ ( filter (\(_,proxi) -> filterProximity proxi (getPhyloProximity p))
144 $ concat
145 $ map (\(t,t') ->
146 let proxi = matchWithPairs g (t,t') p
147 in
148 if (t == t')
149 then [(getGroupId t,proxi)]
150 else [(getGroupId t,proxi),(getGroupId t',proxi)] ) pairs ) ) []
151 -- | [[1900],[1900,1901],[1900,1901,1902],...] | length max => + 5 years
152 $ inits periods
153 --------------------------------------
154
155
156 -- | To add some Pointer to a PhyloGroup
157 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
158 addPointers' fil pts g = g & case fil of
159 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
160 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
161 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
162
163
164
165 -- | To update a list of phyloGroups with some Pointers
166 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
167 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if ((getGroupLevel g) == lvl) && (member (getGroupId g) m)
168 then addPointers' fil (m ! (getGroupId g)) g
169 else g ) gs) p
170
171
172
173 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
174 initCandidates :: PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [PhyloGroup]
175 initCandidates g prds gs = filter (\g' -> elem (getGroupPeriod g') prds)
176 $ filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
177 $ delete g gs
178
179
180 -- | a init avec la [[head groups]] et la tail groups
181 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
182 toBranches mem gs
183 | null gs = mem
184 | otherwise = toBranches mem' $ tail gs
185 where
186 --------------------------------------
187 mem' :: [[PhyloGroup]]
188 mem' = if (null withHead)
189 then mem ++ [[head' "toBranches" gs]]
190 else (filter (\gs' -> not $ elem gs' withHead) mem)
191 ++
192 [(concat withHead) ++ [head' "toBranches" gs]]
193 --------------------------------------
194 withHead :: [[PhyloGroup]]
195 withHead = filter (\gs' -> (not . null)
196 $ intersect (concat $ map getGroupNgrams gs')
197 (getGroupNgrams $ (head' "toBranches" gs))
198 ) mem
199 --------------------------------------
200
201
202 -- | To process an intertemporal matching task to a Phylo at a given level
203 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
204 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
205 -- | 3) update all the groups with the new pointers if they exist
206 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
207 interTempoMatching fil lvl _ p = updateGroups fil lvl (Map.fromList pointers) p
208 where
209 --------------------------------------
210 pointers :: [(PhyloGroupId,[Pointer])]
211 pointers =
212 let pts = map (\g -> let periods = getNextPeriods fil (getPhyloMatchingFrame p) (getGroupPeriod g) (getPhyloPeriods p)
213 in (getGroupId g, phyloGroupMatching periods g p)) groups
214 pts' = pts `using` parList rdeepseq
215 in pts'
216 --------------------------------------
217 groups :: [PhyloGroup]
218 groups = getGroupsWithLevel lvl p
219 --------------------------------------
220
221
222 ------------------------------------------------------------------------
223 -- | Make links from Period to Period after level 1
224
225 -- | Transpose the parent/child pointers from one level to another
226 transposePeriodLinks :: Level -> Phylo -> Phylo
227 transposePeriodLinks lvl p = alterPhyloGroups
228 (\gs -> if ((not . null) gs) && (elem lvl $ map getGroupLevel gs)
229 then
230 let groups = map (\g -> let m = reduceGroups g lvlGroups
231 in g & phylo_groupPeriodParents .~ (trackPointers m $ g ^. phylo_groupPeriodParents)
232 & phylo_groupPeriodChilds .~ (trackPointers m $ g ^. phylo_groupPeriodChilds )) gs
233 groups' = groups `using` parList rdeepseq
234 in groups'
235 else gs
236 ) p
237 where
238 --------------------------------------
239 -- | find an other way to find the group from the id
240 trackPointers :: Map PhyloGroupId PhyloGroup -> [Pointer] -> [Pointer]
241 trackPointers m pts = Map.toList
242 $ fromListWith (\w w' -> max w w')
243 $ map (\(id,_w) -> (getGroupLevelParentId $ m ! id,_w)) pts
244 --------------------------------------
245 reduceGroups :: PhyloGroup -> [PhyloGroup] -> Map PhyloGroupId PhyloGroup
246 reduceGroups g gs = Map.fromList
247 $ map (\g' -> (getGroupId g',g'))
248 $ filter (\g' -> ((not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))) gs
249 --------------------------------------
250 lvlGroups :: [PhyloGroup]
251 lvlGroups = getGroupsWithLevel (lvl - 1) p
252 --------------------------------------
253
254
255 ----------------
256 -- | Tracer | --
257 ----------------
258
259 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
260 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
261 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
262 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
263 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
264 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
265 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
266
267
268 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
269 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
270 <> "with sizes : " <> show (map length bs) <> "\n") bs
271