]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloExport.hs
[upload zip] some more work on zipfile parsing
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloExport.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloExport
3 Description : Exportation module of a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE TypeSynonymInstances #-}
12
13 module Gargantext.Core.Viz.Phylo.PhyloExport where
14
15 import Data.Map (Map, fromList, empty, fromListWith, insert, (!), elems, unionWith, findWithDefault, toList, member)
16 import Data.List ((++), sort, nub, null, concat, sortOn, groupBy, union, (\\), (!!), init, partition, notElem, unwords, nubBy, inits, elemIndex)
17 import Data.Vector (Vector)
18
19 import Prelude (writeFile)
20 import Gargantext.Prelude
21 import Gargantext.Core.Viz.AdaptativePhylo
22 import Gargantext.Core.Viz.Phylo.PhyloTools
23 import Gargantext.Core.Viz.Phylo.TemporalMatching (filterDocs, filterDiago, reduceDiagos, toProximity, getNextPeriods)
24
25 import Control.Lens hiding (Level)
26 import Control.Parallel.Strategies (parList, rdeepseq, using)
27 import Data.GraphViz hiding (DotGraph, Order)
28 import Data.GraphViz.Types.Generalised (DotGraph)
29 import Data.GraphViz.Attributes.Complete hiding (EdgeType, Order)
30 import Data.GraphViz.Types.Monadic
31 import Data.Text.Lazy (fromStrict, pack, unpack)
32 import System.FilePath
33 import Debug.Trace (trace)
34
35 import qualified Data.Text as Text
36 import qualified Data.Vector as Vector
37 import qualified Data.Text.Lazy as Lazy
38 import qualified Data.GraphViz.Attributes.HTML as H
39
40 --------------------
41 -- | Dot export | --
42 --------------------
43
44 dotToFile :: FilePath -> DotGraph DotId -> IO ()
45 dotToFile filePath dotG = writeFile filePath $ dotToString dotG
46
47 dotToString :: DotGraph DotId -> [Char]
48 dotToString dotG = unpack (printDotGraph dotG)
49
50 dynamicToColor :: Double -> H.Attribute
51 dynamicToColor d
52 | d == 0 = H.BGColor (toColor LightCoral)
53 | d == 1 = H.BGColor (toColor Khaki)
54 | d == 2 = H.BGColor (toColor SkyBlue)
55 | otherwise = H.Color (toColor Black)
56
57 pickLabelColor :: [Double] -> H.Attribute
58 pickLabelColor lst
59 | elem 0 lst = dynamicToColor 0
60 | elem 2 lst = dynamicToColor 2
61 | elem 1 lst = dynamicToColor 1
62 | otherwise = dynamicToColor 3
63
64 toDotLabel :: Text.Text -> Label
65 toDotLabel lbl = StrLabel $ fromStrict lbl
66
67 toAttr :: AttributeName -> Lazy.Text -> CustomAttribute
68 toAttr k v = customAttribute k v
69
70 metaToAttr :: Map Text.Text [Double] -> [CustomAttribute]
71 metaToAttr meta = map (\(k,v) -> toAttr (fromStrict k) $ (pack . unwords) $ map show v) $ toList meta
72
73 groupIdToDotId :: PhyloGroupId -> DotId
74 groupIdToDotId (((d,d'),lvl),idx) = (fromStrict . Text.pack) $ ("group" <> (show d) <> (show d') <> (show lvl) <> (show idx))
75
76 branchIdToDotId :: PhyloBranchId -> DotId
77 branchIdToDotId bId = (fromStrict . Text.pack) $ ("branch" <> show (snd bId))
78
79 periodIdToDotId :: PhyloPeriodId -> DotId
80 periodIdToDotId prd = (fromStrict . Text.pack) $ ("period" <> show (fst prd) <> show (snd prd))
81
82 groupToTable :: Vector Ngrams -> PhyloGroup -> H.Label
83 groupToTable fdt g = H.Table H.HTable
84 { H.tableFontAttrs = Just [H.PointSize 14, H.Align H.HLeft]
85 , H.tableAttrs = [H.Border 0, H.CellBorder 0, H.BGColor (toColor White)]
86 , H.tableRows = [header]
87 <> [H.Cells [H.LabelCell [H.Height 10] $ H.Text [H.Str $ fromStrict ""]]]
88 <> ( map ngramsToRow $ splitEvery 4
89 $ reverse $ sortOn (snd . snd)
90 $ zip (ngramsToText fdt (g ^. phylo_groupNgrams))
91 $ zip ((g ^. phylo_groupMeta) ! "dynamics") ((g ^. phylo_groupMeta) ! "inclusion"))}
92 where
93 --------------------------------------
94 ngramsToRow :: [(Ngrams,(Double,Double))] -> H.Row
95 ngramsToRow ns = H.Cells $ map (\(n,(d,_)) ->
96 H.LabelCell [H.Align H.HLeft,dynamicToColor d] $ H.Text [H.Str $ fromStrict n]) ns
97 --------------------------------------
98 header :: H.Row
99 header =
100 H.Cells [ H.LabelCell [pickLabelColor ((g ^. phylo_groupMeta) ! "dynamics")]
101 $ H.Text [H.Str $ (((fromStrict . Text.toUpper) $ g ^. phylo_groupLabel)
102 <> (fromStrict " ( ")
103 <> (pack $ show (fst $ g ^. phylo_groupPeriod))
104 <> (fromStrict " , ")
105 <> (pack $ show (snd $ g ^. phylo_groupPeriod))
106 <> (fromStrict " ) ")
107 <> (pack $ show (getGroupId g)))]]
108 --------------------------------------
109
110 branchToDotNode :: PhyloBranch -> Int -> Dot DotId
111 branchToDotNode b bId =
112 node (branchIdToDotId $ b ^. branch_id)
113 ([FillColor [toWColor CornSilk], FontName "Arial", FontSize 40, Shape Egg, Style [SItem Bold []], Label (toDotLabel $ b ^. branch_label)]
114 <> (metaToAttr $ b ^. branch_meta)
115 <> [ toAttr "nodeType" "branch"
116 , toAttr "bId" (pack $ show bId)
117 , toAttr "branchId" (pack $ unwords (map show $ snd $ b ^. branch_id))
118 , toAttr "branch_x" (fromStrict $ Text.pack $ (show $ b ^. branch_x))
119 , toAttr "branch_y" (fromStrict $ Text.pack $ (show $ b ^. branch_y))
120 , toAttr "label" (pack $ show $ b ^. branch_label)
121 ])
122
123 periodToDotNode :: (Date,Date) -> (Text.Text,Text.Text) -> Dot DotId
124 periodToDotNode prd prd' =
125 node (periodIdToDotId prd)
126 ([Shape BoxShape, FontSize 50, Label (toDotLabel $ Text.pack (show (fst prd) <> " " <> show (snd prd)))]
127 <> [ toAttr "nodeType" "period"
128 , toAttr "strFrom" (fromStrict $ Text.pack $ (show $ fst prd'))
129 , toAttr "strTo" (fromStrict $ Text.pack $ (show $ snd prd'))
130 , toAttr "from" (fromStrict $ Text.pack $ (show $ fst prd))
131 , toAttr "to" (fromStrict $ Text.pack $ (show $ snd prd))])
132
133
134 groupToDotNode :: Vector Ngrams -> PhyloGroup -> Int -> Dot DotId
135 groupToDotNode fdt g bId =
136 node (groupIdToDotId $ getGroupId g)
137 ([FontName "Arial", Shape Square, penWidth 4, toLabel (groupToTable fdt g)]
138 <> [ toAttr "nodeType" "group"
139 , toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
140 , toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
141 , toAttr "strFrom" (pack $ show (fst $ g ^. phylo_groupPeriod'))
142 , toAttr "strTo" (pack $ show (snd $ g ^. phylo_groupPeriod'))
143 , toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
144 , toAttr "bId" (pack $ show bId)
145 , toAttr "support" (pack $ show (g ^. phylo_groupSupport))
146 , toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
147 , toAttr "source" (pack $ show (nub $ g ^. phylo_groupSources))
148 , toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
149 , toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
150 , toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
151 , toAttr "frequence" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "frequence")))
152 ])
153
154
155 toDotEdge :: DotId -> DotId -> [Char] -> EdgeType -> Dot DotId
156 toDotEdge source target lbl edgeType = edge source target
157 (case edgeType of
158 GroupToGroup -> [ Width 3, penWidth 4, Color [toWColor Black], Constraint True] <> [toAttr "edgeType" "link", toAttr "lbl" (pack lbl)]
159 BranchToGroup -> [ Width 3, Color [toWColor Black], ArrowHead (AType [(ArrMod FilledArrow RightSide,DotArrow)])] <> [toAttr "edgeType" "branchLink" ]
160 BranchToBranch -> [ Width 2, Color [toWColor Black], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,DotArrow)])]
161 GroupToAncestor -> [ Width 3, Color [toWColor Red], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,NoArrow)]), PenWidth 4] <> [toAttr "edgeType" "ancestorLink", toAttr "lbl" (pack lbl)]
162 PeriodToPeriod -> [ Width 5, Color [toWColor Black]])
163
164
165 mergePointers :: [PhyloGroup] -> Map (PhyloGroupId,PhyloGroupId) Double
166 mergePointers groups =
167 let toChilds = fromList $ concat $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupPeriodChilds) groups
168 toParents = fromList $ concat $ map (\g -> map (\(target,w) -> ((target,getGroupId g),w)) $ g ^. phylo_groupPeriodParents) groups
169 in unionWith (\w w' -> max w w') toChilds toParents
170
171 mergeAncestors :: [PhyloGroup] -> [((PhyloGroupId,PhyloGroupId), Double)]
172 mergeAncestors groups = concat
173 $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupAncestors)
174 $ filter (\g -> (not . null) $ g ^. phylo_groupAncestors) groups
175
176
177 toBid :: PhyloGroup -> [PhyloBranch] -> Int
178 toBid g bs =
179 let b' = head' "toBid" (filter (\b -> b ^. branch_id == g ^. phylo_groupBranchId) bs)
180 in fromJust $ elemIndex b' bs
181
182 exportToDot :: Phylo -> PhyloExport -> DotGraph DotId
183 exportToDot phylo export =
184 trace ("\n-- | Convert " <> show(length $ export ^. export_branches) <> " branches and "
185 <> show(length $ export ^. export_groups) <> " groups "
186 <> show(length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) $ export ^. export_groups) <> " terms to a dot file\n\n"
187 <> "##########################") $
188 digraph ((Str . fromStrict) $ (phyloName $ getConfig phylo)) $ do
189
190 {- 1) init the dot graph -}
191 graphAttrs ( [ Label (toDotLabel $ (phyloName $ getConfig phylo))]
192 <> [ FontSize 30, LabelLoc VTop, NodeSep 1, RankSep [1], Rank SameRank, Splines SplineEdges, Overlap ScaleOverlaps
193 , Ratio FillRatio
194 , Style [SItem Filled []],Color [toWColor White]]
195 {-- home made attributes -}
196 <> [(toAttr (fromStrict "phyloFoundations") $ pack $ show (length $ Vector.toList $ getRoots phylo))
197 ,(toAttr (fromStrict "phyloTerms") $ pack $ show (length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) $ export ^. export_groups))
198 ,(toAttr (fromStrict "phyloDocs") $ pack $ show (sum $ elems $ phylo ^. phylo_timeDocs))
199 ,(toAttr (fromStrict "phyloPeriods") $ pack $ show (length $ elems $ phylo ^. phylo_periods))
200 ,(toAttr (fromStrict "phyloBranches") $ pack $ show (length $ export ^. export_branches))
201 ,(toAttr (fromStrict "phyloGroups") $ pack $ show (length $ export ^. export_groups))
202 ,(toAttr (fromStrict "phyloSources") $ pack $ show (Vector.toList $ getSources phylo))
203 ,(toAttr (fromStrict "phyloTimeScale") $ pack $ getTimeScale phylo)
204 -- ,(toAttr (fromStrict "phyloTermsFreq") $ pack $ show (toList $ _phylo_lastTermFreq phylo))
205 ])
206
207 {-
208 -- toAttr (fromStrict k) $ (pack . unwords) $ map show v
209
210 -- 2) create a layer for the branches labels -}
211 subgraph (Str "Branches peaks") $ do
212
213 -- graphAttrs [Rank SameRank]
214 {-
215 -- 3) group the branches by hierarchy
216 -- mapM (\branches ->
217 -- subgraph (Str "Branches clade") $ do
218 -- graphAttrs [Rank SameRank]
219
220 -- -- 4) create a node for each branch
221 -- mapM branchToDotNode branches
222 -- ) $ elems $ fromListWith (++) $ map (\b -> ((init . snd) $ b ^. branch_id,[b])) $ export ^. export_branches
223 -}
224 mapM (\b -> branchToDotNode b (fromJust $ elemIndex b (export ^. export_branches))) $ export ^. export_branches
225
226 {-- 5) create a layer for each period -}
227 _ <- mapM (\period ->
228 subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst $ _phylo_periodPeriod period) <> show (snd $ _phylo_periodPeriod period))) $ do
229 graphAttrs [Rank SameRank]
230 periodToDotNode (period ^. phylo_periodPeriod) (period ^. phylo_periodPeriod')
231
232 {-- 6) create a node for each group -}
233 mapM (\g -> groupToDotNode (getRoots phylo) g (toBid g (export ^. export_branches))) (filter (\g -> g ^. phylo_groupPeriod == (period ^. phylo_periodPeriod)) $ export ^. export_groups)
234 ) $ phylo ^. phylo_periods
235
236 {-- 7) create the edges between a branch and its first groups -}
237 _ <- mapM (\(bId,groups) ->
238 mapM (\g -> toDotEdge (branchIdToDotId bId) (groupIdToDotId $ getGroupId g) "" BranchToGroup) groups
239 )
240 $ toList
241 $ map (\groups -> head' "toDot"
242 $ groupBy (\g g' -> g' ^. phylo_groupPeriod == g ^. phylo_groupPeriod)
243 $ sortOn (fst . _phylo_groupPeriod) groups)
244 $ fromListWith (++) $ map (\g -> (g ^. phylo_groupBranchId,[g])) $ export ^. export_groups
245
246 {- 8) create the edges between the groups -}
247 _ <- mapM (\((k,k'),v) ->
248 toDotEdge (groupIdToDotId k) (groupIdToDotId k') (show v) GroupToGroup
249 ) $ (toList . mergePointers) $ export ^. export_groups
250
251 _ <- mapM (\((k,k'),v) ->
252 toDotEdge (groupIdToDotId k) (groupIdToDotId k') (show v) GroupToAncestor
253 ) $ mergeAncestors $ export ^. export_groups
254
255 -- 10) create the edges between the periods
256 _ <- mapM (\(prd,prd') ->
257 toDotEdge (periodIdToDotId prd) (periodIdToDotId prd') "" PeriodToPeriod
258 ) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ getPeriodIds phylo
259
260 {- 8) create the edges between the branches
261 -- _ <- mapM (\(bId,bId') ->
262 -- toDotEdge (branchIdToDotId bId) (branchIdToDotId bId')
263 -- (Text.pack $ show(branchIdsToProximity bId bId'
264 -- (getThresholdInit $ phyloProximity $ getConfig phylo)
265 -- (getThresholdStep $ phyloProximity $ getConfig phylo))) BranchToBranch
266 -- ) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ map _branch_id $ export ^. export_branches
267 -}
268
269
270 graphAttrs [Rank SameRank]
271
272
273 ----------------
274 -- | Filter | --
275 ----------------
276
277 filterByBranchSize :: Double -> PhyloExport -> PhyloExport
278 filterByBranchSize thr export =
279 let splited = partition (\b -> head' "filter" ((b ^. branch_meta) ! "size") >= thr) $ export ^. export_branches
280 in export & export_branches .~ (fst splited)
281 & export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd splited)))
282
283
284 processFilters :: [Filter] -> Quality -> PhyloExport -> PhyloExport
285 processFilters filters qua export =
286 foldl (\export' f -> case f of
287 ByBranchSize thr -> if (thr < (fromIntegral $ qua ^. qua_minBranch))
288 then filterByBranchSize (fromIntegral $ qua ^. qua_minBranch) export'
289 else filterByBranchSize thr export'
290 ) export filters
291
292 --------------
293 -- | Sort | --
294 --------------
295
296 branchToIso :: [PhyloBranch] -> [PhyloBranch]
297 branchToIso branches =
298 let steps = map sum
299 $ inits
300 $ map (\(b,x) -> b ^. branch_y + 0.05 - x)
301 $ zip branches
302 $ ([0] ++ (map (\(b,b') ->
303 let idx = length $ commonPrefix (b ^. branch_canonId) (b' ^. branch_canonId) []
304 lmin = min (length $ b ^. branch_seaLevel) (length $ b' ^. branch_seaLevel)
305 in
306 if ((idx - 1) > ((length $ b' ^. branch_seaLevel) - 1))
307 then (b' ^. branch_seaLevel) !! (lmin - 1)
308 else (b' ^. branch_seaLevel) !! (idx - 1)
309 ) $ listToSeq branches))
310 in map (\(x,b) -> b & branch_x .~ x)
311 $ zip steps branches
312
313 branchToIso' :: Double -> Double -> [PhyloBranch] -> [PhyloBranch]
314 branchToIso' start step branches =
315 let bx = map (\l -> (sum l) + ((fromIntegral $ length l) * 0.5))
316 $ inits
317 $ ([0] ++ (map (\(b,b') ->
318 let root = fromIntegral $ length $ commonPrefix (snd $ b ^. branch_id) (snd $ b' ^. branch_id) []
319 in 1 - start - step * root) $ listToSeq branches))
320 in map (\(x,b) -> b & branch_x .~ x)
321 $ zip bx branches
322
323
324 sortByHierarchy :: Int -> [PhyloBranch] -> [PhyloBranch]
325 sortByHierarchy depth branches =
326 if (length branches == 1)
327 then branches
328 else concat
329 $ map (\branches' ->
330 let partitions = partition (\b -> depth + 1 == ((length . snd) $ b ^. branch_id)) branches'
331 in (sortOn (\b -> (b ^. branch_meta) ! "birth") (fst partitions))
332 ++ (sortByHierarchy (depth + 1) (snd partitions)))
333 $ groupBy (\b b' -> ((take depth . snd) $ b ^. branch_id) == ((take depth . snd) $ b' ^. branch_id) )
334 $ sortOn (\b -> (take depth . snd) $ b ^. branch_id) branches
335
336
337 sortByBirthDate :: Order -> PhyloExport -> PhyloExport
338 sortByBirthDate order export =
339 let branches = sortOn (\b -> (b ^. branch_meta) ! "birth") $ export ^. export_branches
340 branches' = case order of
341 Asc -> branches
342 Desc -> reverse branches
343 in export & export_branches .~ branches'
344
345 processSort :: Sort -> SeaElevation -> PhyloExport -> PhyloExport
346 processSort sort' elev export = case sort' of
347 ByBirthDate o -> sortByBirthDate o export
348 ByHierarchy -> export & export_branches .~ (branchToIso' (_cons_start elev) (_cons_step elev)
349 $ sortByHierarchy 0 (export ^. export_branches))
350
351
352 -----------------
353 -- | Metrics | --
354 -----------------
355
356 -- | Return the conditional probability of i knowing j
357 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
358 conditional m i j = (findWithDefault 0 (i,j) m)
359 / (m ! (j,j))
360
361
362 -- | Return the genericity score of a given ngram
363 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
364 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
365 - (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
366
367
368 -- | Return the specificity score of a given ngram
369 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
370 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
371 - (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
372
373
374 -- | Return the inclusion score of a given ngram
375 inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
376 inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
377 + (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
378
379
380 ngramsMetrics :: Phylo -> PhyloExport -> PhyloExport
381 ngramsMetrics phylo export =
382 over ( export_groups
383 . traverse )
384 (\g -> g & phylo_groupMeta %~ insert "genericity"
385 (map (\n -> genericity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
386 & phylo_groupMeta %~ insert "specificity"
387 (map (\n -> specificity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
388 & phylo_groupMeta %~ insert "inclusion"
389 (map (\n -> inclusion (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
390 & phylo_groupMeta %~ insert "frequence"
391 (map (\n -> getInMap n (phylo ^. phylo_lastTermFreq)) $ g ^. phylo_groupNgrams)
392 ) export
393
394
395 branchDating :: PhyloExport -> PhyloExport
396 branchDating export =
397 over ( export_branches
398 . traverse )
399 (\b ->
400 let groups = sortOn fst
401 $ foldl' (\acc g -> if (g ^. phylo_groupBranchId == b ^. branch_id)
402 then acc ++ [g ^. phylo_groupPeriod]
403 else acc ) [] $ export ^. export_groups
404 periods = nub groups
405 birth = fst $ head' "birth" groups
406 age = (snd $ last' "age" groups) - birth
407 in b & branch_meta %~ insert "birth" [fromIntegral birth]
408 & branch_meta %~ insert "age" [fromIntegral age]
409 & branch_meta %~ insert "size" [fromIntegral $ length periods] ) export
410
411 processMetrics :: Phylo -> PhyloExport -> PhyloExport
412 processMetrics phylo export = ngramsMetrics phylo
413 $ branchDating export
414
415
416 -----------------
417 -- | Taggers | --
418 -----------------
419
420 nk :: Int -> [[Int]] -> Int
421 nk n groups = sum
422 $ map (\g -> if (elem n g)
423 then 1
424 else 0) groups
425
426
427 tf :: Int -> [[Int]] -> Double
428 tf n groups = (fromIntegral $ nk n groups) / (fromIntegral $ length $ concat groups)
429
430
431 idf :: Int -> [[Int]] -> Double
432 idf n groups = log ((fromIntegral $ length groups) / (fromIntegral $ nk n groups))
433
434
435 findTfIdf :: [[Int]] -> [(Int,Double)]
436 findTfIdf groups = reverse $ sortOn snd $ map (\n -> (n,(tf n groups) * (idf n groups))) $ sort $ nub $ concat groups
437
438
439 findEmergences :: [PhyloGroup] -> Map Int Double -> [(Int,Double)]
440 findEmergences groups freq =
441 let ngrams = map _phylo_groupNgrams groups
442 dynamics = map (\g -> (g ^. phylo_groupMeta) ! "dynamics") groups
443 emerging = nubBy (\n1 n2 -> fst n1 == fst n2)
444 $ concat $ map (\g -> filter (\(_,d) -> d == 0) $ zip (fst g) (snd g)) $ zip ngrams dynamics
445 in reverse $ sortOn snd
446 $ map (\(n,_) -> if (member n freq)
447 then (n,freq ! n)
448 else (n,0)) emerging
449
450
451 mostEmergentTfIdf :: Int -> Map Int Double -> Vector Ngrams -> PhyloExport -> PhyloExport
452 mostEmergentTfIdf nth freq foundations export =
453 over ( export_branches
454 . traverse )
455 (\b ->
456 let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
457 tfidf = findTfIdf (map _phylo_groupNgrams groups)
458 emergences = findEmergences groups freq
459 selected = if (null emergences)
460 then map fst $ take nth tfidf
461 else [fst $ head' "mostEmergentTfIdf" emergences]
462 ++ (map fst $ take (nth - 1) $ filter (\(n,_) -> n /= (fst $ head' "mostEmergentTfIdf" emergences)) tfidf)
463 in b & branch_label .~ (ngramsToLabel foundations selected)) export
464
465
466 getNthMostMeta :: Int -> [Double] -> [Int] -> [Int]
467 getNthMostMeta nth meta ns = map (\(idx,_) -> (ns !! idx))
468 $ take nth
469 $ reverse
470 $ sortOn snd $ zip [0..] meta
471
472
473 mostInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
474 mostInclusive nth foundations export =
475 over ( export_branches
476 . traverse )
477 (\b ->
478 let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
479 cooc = foldl (\acc g -> unionWith (+) acc (g ^. phylo_groupCooc)) empty groups
480 ngrams = sort $ foldl (\acc g -> union acc (g ^. phylo_groupNgrams)) [] groups
481 inc = map (\n -> inclusion cooc (ngrams \\ [n]) n) ngrams
482 lbl = ngramsToLabel foundations $ getNthMostMeta nth inc ngrams
483 in b & branch_label .~ lbl ) export
484
485
486 mostEmergentInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
487 mostEmergentInclusive nth foundations export =
488 over ( export_groups
489 . traverse )
490 (\g ->
491 let lbl = ngramsToLabel foundations
492 $ take nth
493 $ map (\(_,(_,idx)) -> idx)
494 $ concat
495 $ map (\groups -> sortOn (fst . snd) groups)
496 $ groupBy ((==) `on` fst) $ reverse $ sortOn fst
497 $ zip ((g ^. phylo_groupMeta) ! "inclusion")
498 $ zip ((g ^. phylo_groupMeta) ! "dynamics") (g ^. phylo_groupNgrams)
499 in g & phylo_groupLabel .~ lbl ) export
500
501
502 processLabels :: [PhyloLabel] -> Vector Ngrams -> Map Int Double -> PhyloExport -> PhyloExport
503 processLabels labels foundations freq export =
504 foldl (\export' label ->
505 case label of
506 GroupLabel tagger nth ->
507 case tagger of
508 MostEmergentInclusive -> mostEmergentInclusive nth foundations export'
509 _ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger"
510 BranchLabel tagger nth ->
511 case tagger of
512 MostInclusive -> mostInclusive nth foundations export'
513 MostEmergentTfIdf -> mostEmergentTfIdf nth freq foundations export'
514 _ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger" ) export labels
515
516
517 ------------------
518 -- | Dynamics | --
519 ------------------
520
521
522 toDynamics :: Int -> [PhyloGroup] -> PhyloGroup -> Map Int (Date,Date) -> Double
523 toDynamics n parents g m =
524 let prd = g ^. phylo_groupPeriod
525 end = last' "dynamics" (sort $ map snd $ elems m)
526 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
527 {- decrease -}
528 then 2
529 else if ((fst prd) == (fst $ m ! n))
530 {- emerging -}
531 then 0
532 else if isNew
533 {- emergence -}
534 then 1
535 else 3
536 where
537 --------------------------------------
538 isNew :: Bool
539 isNew = not $ elem n $ concat $ map _phylo_groupNgrams parents
540
541
542 processDynamics :: [PhyloGroup] -> [PhyloGroup]
543 processDynamics groups =
544 map (\g ->
545 let parents = filter (\g' -> (g ^. phylo_groupBranchId == g' ^. phylo_groupBranchId)
546 && ((fst $ g ^. phylo_groupPeriod) > (fst $ g' ^. phylo_groupPeriod))) groups
547 in g & phylo_groupMeta %~ insert "dynamics" (map (\n -> toDynamics n parents g mapNgrams) $ g ^. phylo_groupNgrams) ) groups
548 where
549 --------------------------------------
550 mapNgrams :: Map Int (Date,Date)
551 mapNgrams = map (\dates ->
552 let dates' = sort dates
553 in (head' "dynamics" dates', last' "dynamics" dates'))
554 $ fromListWith (++)
555 $ foldl (\acc g -> acc ++ ( map (\n -> (n,[fst $ g ^. phylo_groupPeriod, snd $ g ^. phylo_groupPeriod]))
556 $ (g ^. phylo_groupNgrams))) [] groups
557
558
559 -----------------
560 -- | horizon | --
561 -----------------
562
563 getGroupThr :: Double -> PhyloGroup -> Double
564 getGroupThr step g =
565 let seaLvl = (g ^. phylo_groupMeta) ! "seaLevels"
566 breaks = (g ^. phylo_groupMeta) ! "breaks"
567 in (last' "export" (take (round $ (last' "export" breaks) + 1) seaLvl)) - step
568
569 toAncestor :: Double -> Map Int Double -> Proximity -> Double -> [PhyloGroup] -> PhyloGroup -> PhyloGroup
570 toAncestor nbDocs diago proximity step candidates ego =
571 let curr = ego ^. phylo_groupAncestors
572 in ego & phylo_groupAncestors .~ (curr ++ (map (\(g,w) -> (getGroupId g,w))
573 $ filter (\(g,w) -> (w > 0) && (w >= (min (getGroupThr step ego) (getGroupThr step g))))
574 $ map (\g -> (g, toProximity nbDocs diago proximity (ego ^. phylo_groupNgrams) (g ^. phylo_groupNgrams) (g ^. phylo_groupNgrams)))
575 $ filter (\g -> g ^. phylo_groupBranchId /= ego ^. phylo_groupBranchId ) candidates))
576
577
578 headsToAncestors :: Double -> Map Int Double -> Proximity -> Double -> [PhyloGroup] -> [PhyloGroup] -> [PhyloGroup]
579 headsToAncestors nbDocs diago proximity step heads acc =
580 if (null heads)
581 then acc
582 else
583 let ego = head' "headsToAncestors" heads
584 heads' = tail' "headsToAncestors" heads
585 in headsToAncestors nbDocs diago proximity step heads' (acc ++ [toAncestor nbDocs diago proximity step heads' ego])
586
587
588 toHorizon :: Phylo -> Phylo
589 toHorizon phylo =
590 let phyloAncestor = updatePhyloGroups
591 level
592 (fromList $ map (\g -> (getGroupId g, g))
593 $ concat
594 $ tracePhyloAncestors newGroups) phylo
595 reBranched = fromList $ map (\g -> (getGroupId g, g)) $ concat
596 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g)) $ getGroupsFromLevel level phyloAncestor
597 in updatePhyloGroups level reBranched phylo
598 where
599 -- | 1) for each periods
600 periods :: [PhyloPeriodId]
601 periods = getPeriodIds phylo
602 -- --
603 level :: Level
604 level = getLastLevel phylo
605 -- --
606 frame :: Int
607 frame = getTimeFrame $ timeUnit $ getConfig phylo
608 -- | 2) find ancestors between groups without parents
609 mapGroups :: [[PhyloGroup]]
610 mapGroups = map (\prd ->
611 let groups = getGroupsFromLevelPeriods level [prd] phylo
612 childs = getPreviousChildIds level frame prd periods phylo
613 -- maybe add a better filter for non isolated ancestors
614 heads = filter (\g -> (not . null) $ (g ^. phylo_groupPeriodChilds))
615 $ filter (\g -> null (g ^. phylo_groupPeriodParents) && (notElem (getGroupId g) childs)) groups
616 noHeads = groups \\ heads
617 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) [prd]
618 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) [prd]
619 proximity = (phyloProximity $ getConfig phylo)
620 step = case getSeaElevation phylo of
621 Constante _ s -> s
622 Adaptative _ -> undefined
623 -- in headsToAncestors nbDocs diago proximity heads groups []
624 in map (\ego -> toAncestor nbDocs diago proximity step noHeads ego)
625 $ headsToAncestors nbDocs diago proximity step heads []
626 ) periods
627 -- | 3) process this task concurrently
628 newGroups :: [[PhyloGroup]]
629 newGroups = mapGroups `using` parList rdeepseq
630 --------------------------------------
631
632 getPreviousChildIds :: Level -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> Phylo -> [PhyloGroupId]
633 getPreviousChildIds lvl frame curr prds phylo =
634 concat $ map ((map fst) . _phylo_groupPeriodChilds)
635 $ getGroupsFromLevelPeriods lvl (getNextPeriods ToParents frame curr prds) phylo
636
637 ---------------------
638 -- | phyloExport | --
639 ---------------------
640
641 toPhyloExport :: Phylo -> DotGraph DotId
642 toPhyloExport phylo = exportToDot phylo
643 $ processFilters (exportFilter $ getConfig phylo) (phyloQuality $ getConfig phylo)
644 $ processSort (exportSort $ getConfig phylo) (getSeaElevation phylo)
645 $ processLabels (exportLabel $ getConfig phylo) (getRoots phylo) (_phylo_lastTermFreq phylo)
646 $ processMetrics phylo export
647 where
648 export :: PhyloExport
649 export = PhyloExport groups branches
650 --------------------------------------
651 branches :: [PhyloBranch]
652 branches = map (\g ->
653 let seaLvl = (g ^. phylo_groupMeta) ! "seaLevels"
654 breaks = (g ^. phylo_groupMeta) ! "breaks"
655 canonId = take (round $ (last' "export" breaks) + 2) (snd $ g ^. phylo_groupBranchId)
656 in PhyloBranch (g ^. phylo_groupBranchId)
657 canonId
658 seaLvl
659 0
660 (last' "export" (take (round $ (last' "export" breaks) + 1) seaLvl))
661 0
662 0
663 "" empty)
664 $ map (\gs -> head' "export" gs)
665 $ groupBy (\g g' -> g ^. phylo_groupBranchId == g' ^. phylo_groupBranchId)
666 $ sortOn (\g -> g ^. phylo_groupBranchId) groups
667 --------------------------------------
668 groups :: [PhyloGroup]
669 groups = traceExportGroups
670 $ processDynamics
671 $ getGroupsFromLevel (phyloLevel $ getConfig phylo)
672 $ tracePhyloInfo phylo
673 -- \$ toHorizon phylo
674
675
676 traceExportBranches :: [PhyloBranch] -> [PhyloBranch]
677 traceExportBranches branches = trace ("\n"
678 <> "-- | Export " <> show(length branches) <> " branches") branches
679
680 tracePhyloAncestors :: [[PhyloGroup]] -> [[PhyloGroup]]
681 tracePhyloAncestors groups = trace ( "-- | Found " <> show(length $ concat $ map _phylo_groupAncestors $ concat groups) <> " ancestors") groups
682
683 tracePhyloInfo :: Phylo -> Phylo
684 tracePhyloInfo phylo = trace ("\n" <> "##########################" <> "\n\n" <> "-- | Phylo with λ = "
685 <> show(_qua_granularity $ phyloQuality $ getConfig phylo) <> " applied to "
686 <> show(length $ Vector.toList $ getRoots phylo) <> " foundations"
687 ) phylo
688
689
690 traceExportGroups :: [PhyloGroup] -> [PhyloGroup]
691 traceExportGroups groups = trace ("\n" <> "-- | Export "
692 <> show(length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups) <> " branches, "
693 <> show(length groups) <> " groups and "
694 <> show(length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) groups) <> " terms"
695 ) groups
696