]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloExport.hs
[FEAT] Backend NLP French tested
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloExport.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloExport
3 Description : Exportation module of a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE TypeSynonymInstances #-}
12
13 module Gargantext.Core.Viz.Phylo.PhyloExport where
14
15 import Control.Lens hiding (Level)
16 import Control.Parallel.Strategies (parList, rdeepseq, using)
17 import Data.GraphViz hiding (DotGraph, Order)
18 import Data.GraphViz.Attributes.Complete hiding (EdgeType, Order)
19 import Data.GraphViz.Types.Generalised (DotGraph)
20 import Data.GraphViz.Types.Monadic
21 import Data.List ((++), sort, nub, null, concat, sortOn, groupBy, union, (\\), (!!), init, partition, notElem, unwords, nubBy, inits, elemIndex)
22 import Data.Map (Map, fromList, empty, fromListWith, insert, (!), elems, unionWith, findWithDefault, toList, member)
23 import Data.Text.Lazy (fromStrict, pack, unpack)
24 import Data.Vector (Vector)
25 import Debug.Trace (trace)
26 import Gargantext.Core.Viz.Phylo
27 import Gargantext.Core.Viz.Phylo.PhyloTools
28 import Gargantext.Core.Viz.Phylo.TemporalMatching (filterDocs, filterDiago, reduceDiagos, toProximity, getNextPeriods)
29 import Gargantext.Prelude
30 import Prelude (writeFile)
31 import System.FilePath
32 import qualified Data.GraphViz.Attributes.HTML as H
33 import qualified Data.Text as Text
34 import qualified Data.Text.Lazy as Lazy
35 import qualified Data.Vector as Vector
36
37 --------------------
38 -- | Dot export | --
39 --------------------
40
41 dotToFile :: FilePath -> DotGraph DotId -> IO ()
42 dotToFile filePath dotG = writeFile filePath $ dotToString dotG
43
44 dotToString :: DotGraph DotId -> [Char]
45 dotToString dotG = unpack (printDotGraph dotG)
46
47 dynamicToColor :: Double -> H.Attribute
48 dynamicToColor d
49 | d == 0 = H.BGColor (toColor LightCoral)
50 | d == 1 = H.BGColor (toColor Khaki)
51 | d == 2 = H.BGColor (toColor SkyBlue)
52 | otherwise = H.Color (toColor Black)
53
54 pickLabelColor :: [Double] -> H.Attribute
55 pickLabelColor lst
56 | elem 0 lst = dynamicToColor 0
57 | elem 2 lst = dynamicToColor 2
58 | elem 1 lst = dynamicToColor 1
59 | otherwise = dynamicToColor 3
60
61 toDotLabel :: Text.Text -> Label
62 toDotLabel lbl = StrLabel $ fromStrict lbl
63
64 toAttr :: AttributeName -> Lazy.Text -> CustomAttribute
65 toAttr k v = customAttribute k v
66
67 metaToAttr :: Map Text.Text [Double] -> [CustomAttribute]
68 metaToAttr meta = map (\(k,v) -> toAttr (fromStrict k) $ (pack . unwords) $ map show v) $ toList meta
69
70 groupIdToDotId :: PhyloGroupId -> DotId
71 groupIdToDotId (((d,d'),lvl),idx) = (fromStrict . Text.pack) $ ("group" <> (show d) <> (show d') <> (show lvl) <> (show idx))
72
73 branchIdToDotId :: PhyloBranchId -> DotId
74 branchIdToDotId bId = (fromStrict . Text.pack) $ ("branch" <> show (snd bId))
75
76 periodIdToDotId :: PhyloPeriodId -> DotId
77 periodIdToDotId prd = (fromStrict . Text.pack) $ ("period" <> show (fst prd) <> show (snd prd))
78
79 groupToTable :: Vector Ngrams -> PhyloGroup -> H.Label
80 groupToTable fdt g = H.Table H.HTable
81 { H.tableFontAttrs = Just [H.PointSize 14, H.Align H.HLeft]
82 , H.tableAttrs = [H.Border 0, H.CellBorder 0, H.BGColor (toColor White)]
83 , H.tableRows = [header]
84 <> [H.Cells [H.LabelCell [H.Height 10] $ H.Text [H.Str $ fromStrict ""]]]
85 <> ( map ngramsToRow $ splitEvery 4
86 $ reverse $ sortOn (snd . snd)
87 $ zip (ngramsToText fdt (g ^. phylo_groupNgrams))
88 $ zip ((g ^. phylo_groupMeta) ! "dynamics") ((g ^. phylo_groupMeta) ! "inclusion"))}
89 where
90 --------------------------------------
91 ngramsToRow :: [(Ngrams,(Double,Double))] -> H.Row
92 ngramsToRow ns = H.Cells $ map (\(n,(d,_)) ->
93 H.LabelCell [H.Align H.HLeft,dynamicToColor d] $ H.Text [H.Str $ fromStrict n]) ns
94 --------------------------------------
95 header :: H.Row
96 header =
97 H.Cells [ H.LabelCell [pickLabelColor ((g ^. phylo_groupMeta) ! "dynamics")]
98 $ H.Text [H.Str $ (((fromStrict . Text.toUpper) $ g ^. phylo_groupLabel)
99 <> (fromStrict " ( ")
100 <> (pack $ show (fst $ g ^. phylo_groupPeriod))
101 <> (fromStrict " , ")
102 <> (pack $ show (snd $ g ^. phylo_groupPeriod))
103 <> (fromStrict " ) ")
104 <> (pack $ show (getGroupId g)))]]
105 --------------------------------------
106
107 branchToDotNode :: PhyloBranch -> Int -> Dot DotId
108 branchToDotNode b bId =
109 node (branchIdToDotId $ b ^. branch_id)
110 ([FillColor [toWColor CornSilk], FontName "Arial", FontSize 40, Shape Egg, Style [SItem Bold []], Label (toDotLabel $ b ^. branch_label)]
111 <> (metaToAttr $ b ^. branch_meta)
112 <> [ toAttr "nodeType" "branch"
113 , toAttr "bId" (pack $ show bId)
114 , toAttr "branchId" (pack $ unwords (map show $ snd $ b ^. branch_id))
115 , toAttr "branch_x" (fromStrict $ Text.pack $ (show $ b ^. branch_x))
116 , toAttr "branch_y" (fromStrict $ Text.pack $ (show $ b ^. branch_y))
117 , toAttr "label" (pack $ show $ b ^. branch_label)
118 ])
119
120 periodToDotNode :: (Date,Date) -> (Text.Text,Text.Text) -> Dot DotId
121 periodToDotNode prd prd' =
122 node (periodIdToDotId prd)
123 ([Shape BoxShape, FontSize 50, Label (toDotLabel $ Text.pack (show (fst prd) <> " " <> show (snd prd)))]
124 <> [ toAttr "nodeType" "period"
125 , toAttr "strFrom" (fromStrict $ Text.pack $ (show $ fst prd'))
126 , toAttr "strTo" (fromStrict $ Text.pack $ (show $ snd prd'))
127 , toAttr "from" (fromStrict $ Text.pack $ (show $ fst prd))
128 , toAttr "to" (fromStrict $ Text.pack $ (show $ snd prd))])
129
130
131 groupToDotNode :: Vector Ngrams -> PhyloGroup -> Int -> Dot DotId
132 groupToDotNode fdt g bId =
133 node (groupIdToDotId $ getGroupId g)
134 ([FontName "Arial", Shape Square, penWidth 4, toLabel (groupToTable fdt g)]
135 <> [ toAttr "nodeType" "group"
136 , toAttr "gid" (groupIdToDotId $ getGroupId g)
137 , toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
138 , toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
139 , toAttr "strFrom" (pack $ show (fst $ g ^. phylo_groupPeriod'))
140 , toAttr "strTo" (pack $ show (snd $ g ^. phylo_groupPeriod'))
141 , toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
142 , toAttr "bId" (pack $ show bId)
143 , toAttr "support" (pack $ show (g ^. phylo_groupSupport))
144 , toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
145 , toAttr "source" (pack $ show (nub $ g ^. phylo_groupSources))
146 , toAttr "sourceFull" (pack $ show (g ^. phylo_groupSources))
147 , toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
148 , toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
149 , toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
150 , toAttr "frequence" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "frequence")))
151 , toAttr "seaLvl" (pack $ show ((g ^. phylo_groupMeta) ! "seaLevels"))
152 ])
153
154
155 toDotEdge' :: DotId -> DotId -> [Char] -> [Char] -> EdgeType -> Dot DotId
156 toDotEdge' source target thr w edgeType = edge source target
157 (case edgeType of
158 GroupToGroup -> undefined
159 GroupToGroupMemory -> [ Width 3, penWidth 4, Color [toWColor Black], Constraint True] <> [toAttr "edgeType" "memoryLink", toAttr "thr" (pack thr), toAttr "weight" (pack w)]
160 BranchToGroup -> undefined
161 BranchToBranch -> undefined
162 GroupToAncestor -> undefined
163 PeriodToPeriod -> undefined)
164
165
166 toDotEdge :: DotId -> DotId -> [Char] -> EdgeType -> Dot DotId
167 toDotEdge source target lbl edgeType = edge source target
168 (case edgeType of
169 GroupToGroup -> [ Width 3, penWidth 4, Color [toWColor Black], Constraint True] <> [toAttr "edgeType" "link", toAttr "lbl" (pack lbl), toAttr "source" source, toAttr "target" target]
170 GroupToGroupMemory -> undefined
171 BranchToGroup -> [ Width 3, Color [toWColor Black], ArrowHead (AType [(ArrMod FilledArrow RightSide,DotArrow)])] <> [toAttr "edgeType" "branchLink" ]
172 BranchToBranch -> [ Width 2, Color [toWColor Black], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,DotArrow)])]
173 GroupToAncestor -> [ Width 3, Color [toWColor Red], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,NoArrow)]), PenWidth 4] <> [toAttr "edgeType" "ancestorLink", toAttr "lbl" (pack lbl), toAttr "source" source, toAttr "target" target]
174 PeriodToPeriod -> [ Width 5, Color [toWColor Black]])
175
176
177 mergePointers :: [PhyloGroup] -> Map (PhyloGroupId,PhyloGroupId) Double
178 mergePointers groups =
179 let toChilds = fromList $ concat $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupPeriodChilds) groups
180 toParents = fromList $ concat $ map (\g -> map (\(target,w) -> ((target,getGroupId g),w)) $ g ^. phylo_groupPeriodParents) groups
181 in unionWith (\w w' -> max w w') toChilds toParents
182
183 mergePointersMemory :: [PhyloGroup] -> [((PhyloGroupId,PhyloGroupId),(Double,Double))]
184 mergePointersMemory groups =
185 let toChilds = concat $ map (\g -> map (\(target,(t,w)) -> ((getGroupId g,target),(t,w))) $ g ^. phylo_groupPeriodMemoryChilds) groups
186 toParents = concat $ map (\g -> map (\(target,(t,w)) -> ((target,getGroupId g),(t,w))) $ g ^. phylo_groupPeriodMemoryParents) groups
187 in concat [toChilds,toParents]
188
189 mergeAncestors :: [PhyloGroup] -> [((PhyloGroupId,PhyloGroupId), Double)]
190 mergeAncestors groups = concat
191 $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupAncestors)
192 $ filter (\g -> (not . null) $ g ^. phylo_groupAncestors) groups
193
194
195 toBid :: PhyloGroup -> [PhyloBranch] -> Int
196 toBid g bs =
197 let b' = head' "toBid" (filter (\b -> b ^. branch_id == g ^. phylo_groupBranchId) bs)
198 in fromJust $ elemIndex b' bs
199
200 exportToDot :: Phylo -> PhyloExport -> DotGraph DotId
201 exportToDot phylo export =
202 trace ("\n-- | Convert " <> show(length $ export ^. export_branches) <> " branches and "
203 <> show(length $ export ^. export_groups) <> " groups "
204 <> show(length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) $ export ^. export_groups) <> " terms to a dot file\n\n"
205 <> "##########################") $
206 digraph ((Str . fromStrict) $ (phyloName $ getConfig phylo)) $ do
207
208 {- 1) init the dot graph -}
209 graphAttrs ( [ Label (toDotLabel $ (phyloName $ getConfig phylo))]
210 <> [ FontSize 30, LabelLoc VTop, NodeSep 1, RankSep [1], Rank SameRank, Splines SplineEdges, Overlap ScaleOverlaps
211 , Ratio FillRatio
212 -- , Ratio AutoRatio
213 , Style [SItem Filled []],Color [toWColor White]]
214 {-- home made attributes -}
215 <> [(toAttr (fromStrict "phyloFoundations") $ pack $ show (length $ Vector.toList $ getRoots phylo))
216 ,(toAttr (fromStrict "phyloTerms") $ pack $ show (length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) $ export ^. export_groups))
217 ,(toAttr (fromStrict "phyloDocs") $ pack $ show (sum $ elems $ phylo ^. phylo_timeDocs))
218 ,(toAttr (fromStrict "phyloPeriods") $ pack $ show (length $ elems $ phylo ^. phylo_periods))
219 ,(toAttr (fromStrict "phyloBranches") $ pack $ show (length $ export ^. export_branches))
220 ,(toAttr (fromStrict "phyloGroups") $ pack $ show (length $ export ^. export_groups))
221 ,(toAttr (fromStrict "phyloSources") $ pack $ show (Vector.toList $ getSources phylo))
222 ,(toAttr (fromStrict "phyloTimeScale") $ pack $ getTimeScale phylo)
223 ,(toAttr (fromStrict "phyloLevel") $ pack $ show (_qua_granularity $ phyloQuality $ getConfig phylo))
224 ,(toAttr (fromStrict "phyloSeaRiseStart") $ pack $ show (_cons_start $ getSeaElevation phylo))
225 ,(toAttr (fromStrict "phyloSeaRiseSteps") $ pack $ show (_cons_step $ getSeaElevation phylo))
226 -- ,(toAttr (fromStrict "phyloTermsFreq") $ pack $ show (toList $ _phylo_lastTermFreq phylo))
227 ])
228
229 {-
230 -- toAttr (fromStrict k) $ (pack . unwords) $ map show v
231
232 -- 2) create a layer for the branches labels -}
233 subgraph (Str "Branches peaks") $ do
234
235 -- graphAttrs [Rank SameRank]
236 {-
237 -- 3) group the branches by hierarchy
238 -- mapM (\branches ->
239 -- subgraph (Str "Branches clade") $ do
240 -- graphAttrs [Rank SameRank]
241
242 -- -- 4) create a node for each branch
243 -- mapM branchToDotNode branches
244 -- ) $ elems $ fromListWith (++) $ map (\b -> ((init . snd) $ b ^. branch_id,[b])) $ export ^. export_branches
245 -}
246 mapM (\b -> branchToDotNode b (fromJust $ elemIndex b (export ^. export_branches))) $ export ^. export_branches
247
248 {-- 5) create a layer for each period -}
249 _ <- mapM (\period ->
250 subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst $ _phylo_periodPeriod period) <> show (snd $ _phylo_periodPeriod period))) $ do
251 graphAttrs [Rank SameRank]
252 periodToDotNode (period ^. phylo_periodPeriod) (period ^. phylo_periodPeriod')
253
254 {-- 6) create a node for each group -}
255 mapM (\g -> groupToDotNode (getRoots phylo) g (toBid g (export ^. export_branches))) (filter (\g -> g ^. phylo_groupPeriod == (period ^. phylo_periodPeriod)) $ export ^. export_groups)
256 ) $ phylo ^. phylo_periods
257
258 {-- 7) create the edges between a branch and its first groups -}
259 _ <- mapM (\(bId,groups) ->
260 mapM (\g -> toDotEdge (branchIdToDotId bId) (groupIdToDotId $ getGroupId g) "" BranchToGroup) groups
261 )
262 $ toList
263 $ map (\groups -> head' "toDot"
264 $ groupBy (\g g' -> g' ^. phylo_groupPeriod == g ^. phylo_groupPeriod)
265 $ sortOn (fst . _phylo_groupPeriod) groups)
266 $ fromListWith (++) $ map (\g -> (g ^. phylo_groupBranchId,[g])) $ export ^. export_groups
267
268 {- 8) create the edges between the groups -}
269 _ <- mapM (\((k,k'),v) ->
270 toDotEdge (groupIdToDotId k) (groupIdToDotId k') (show v) GroupToGroup
271 ) $ (toList . mergePointers) $ export ^. export_groups
272
273 {- 8-bis) create the edges between the groups -}
274 {- _ <- mapM (\((k,k'),v) ->
275 toDotEdge' (groupIdToDotId k) (groupIdToDotId k') (show (fst v)) (show (snd v)) GroupToGroupMemory
276 ) $ mergePointersMemory $ export ^. export_groups -}
277
278 _ <- mapM (\((k,k'),v) ->
279 toDotEdge (groupIdToDotId k) (groupIdToDotId k') (show v) GroupToAncestor
280 ) $ mergeAncestors $ export ^. export_groups
281
282 -- 10) create the edges between the periods
283 _ <- mapM (\(prd,prd') ->
284 toDotEdge (periodIdToDotId prd) (periodIdToDotId prd') "" PeriodToPeriod
285 ) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ getPeriodIds phylo
286
287 {- 8) create the edges between the branches
288 -- _ <- mapM (\(bId,bId') ->
289 -- toDotEdge (branchIdToDotId bId) (branchIdToDotId bId')
290 -- (Text.pack $ show(branchIdsToProximity bId bId'
291 -- (getThresholdInit $ phyloProximity $ getConfig phylo)
292 -- (getThresholdStep $ phyloProximity $ getConfig phylo))) BranchToBranch
293 -- ) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ map _branch_id $ export ^. export_branches
294 -}
295
296
297 graphAttrs [Rank SameRank]
298
299
300 ----------------
301 -- | Filter | --
302 ----------------
303
304 filterByBranchSize :: Double -> PhyloExport -> PhyloExport
305 filterByBranchSize thr export =
306 let splited = partition (\b -> head' "filter" ((b ^. branch_meta) ! "size") >= thr) $ export ^. export_branches
307 in export & export_branches .~ (fst splited)
308 & export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd splited)))
309
310
311 processFilters :: [Filter] -> Quality -> PhyloExport -> PhyloExport
312 processFilters filters qua export =
313 foldl (\export' f -> case f of
314 ByBranchSize thr -> if (thr < (fromIntegral $ qua ^. qua_minBranch))
315 then filterByBranchSize (fromIntegral $ qua ^. qua_minBranch) export'
316 else filterByBranchSize thr export'
317 ) export filters
318
319 --------------
320 -- | Sort | --
321 --------------
322
323 branchToIso :: [PhyloBranch] -> [PhyloBranch]
324 branchToIso branches =
325 let steps = map sum
326 $ inits
327 $ map (\(b,x) -> b ^. branch_y + 0.05 - x)
328 $ zip branches
329 $ ([0] ++ (map (\(b,b') ->
330 let idx = length $ commonPrefix (b ^. branch_canonId) (b' ^. branch_canonId) []
331 lmin = min (length $ b ^. branch_seaLevel) (length $ b' ^. branch_seaLevel)
332 in
333 if ((idx - 1) > ((length $ b' ^. branch_seaLevel) - 1))
334 then (b' ^. branch_seaLevel) !! (lmin - 1)
335 else (b' ^. branch_seaLevel) !! (idx - 1)
336 ) $ listToSeq branches))
337 in map (\(x,b) -> b & branch_x .~ x)
338 $ zip steps branches
339
340 branchToIso' :: Double -> Double -> [PhyloBranch] -> [PhyloBranch]
341 branchToIso' start step branches =
342 let bx = map (\l -> (sum l) + ((fromIntegral $ length l) * 0.5))
343 $ inits
344 $ ([0] ++ (map (\(b,b') ->
345 let root = fromIntegral $ length $ commonPrefix (snd $ b ^. branch_id) (snd $ b' ^. branch_id) []
346 in 1 - start - step * root) $ listToSeq branches))
347 in map (\(x,b) -> b & branch_x .~ x)
348 $ zip bx branches
349
350
351 sortByHierarchy :: Int -> [PhyloBranch] -> [PhyloBranch]
352 sortByHierarchy depth branches =
353 if (length branches == 1)
354 then branches
355 else concat
356 $ map (\branches' ->
357 let partitions = partition (\b -> depth + 1 == ((length . snd) $ b ^. branch_id)) branches'
358 in (sortOn (\b -> (b ^. branch_meta) ! "birth") (fst partitions))
359 ++ (sortByHierarchy (depth + 1) (snd partitions)))
360 $ groupBy (\b b' -> ((take depth . snd) $ b ^. branch_id) == ((take depth . snd) $ b' ^. branch_id) )
361 $ sortOn (\b -> (take depth . snd) $ b ^. branch_id) branches
362
363
364 sortByBirthDate :: Order -> PhyloExport -> PhyloExport
365 sortByBirthDate order export =
366 let branches = sortOn (\b -> (b ^. branch_meta) ! "birth") $ export ^. export_branches
367 branches' = case order of
368 Asc -> branches
369 Desc -> reverse branches
370 in export & export_branches .~ branches'
371
372 processSort :: Sort -> SeaElevation -> PhyloExport -> PhyloExport
373 processSort sort' elev export = case sort' of
374 ByBirthDate o -> sortByBirthDate o export
375 ByHierarchy _ -> export & export_branches .~ (branchToIso' (_cons_start elev) (_cons_step elev)
376 $ sortByHierarchy 0 (export ^. export_branches))
377
378
379 -----------------
380 -- | Metrics | --
381 -----------------
382
383 -- | Return the conditional probability of i knowing j
384 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
385 conditional m i j = (findWithDefault 0 (i,j) m)
386 / (m ! (j,j))
387
388
389 -- | Return the genericity score of a given ngram
390 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
391 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
392 - (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
393
394
395 -- | Return the specificity score of a given ngram
396 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
397 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
398 - (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
399
400
401 -- | Return the inclusion score of a given ngram
402 inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
403 inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
404 + (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
405
406
407 ngramsMetrics :: Phylo -> PhyloExport -> PhyloExport
408 ngramsMetrics phylo export =
409 over ( export_groups
410 . traverse )
411 (\g -> g & phylo_groupMeta %~ insert "genericity"
412 (map (\n -> genericity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
413 & phylo_groupMeta %~ insert "specificity"
414 (map (\n -> specificity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
415 & phylo_groupMeta %~ insert "inclusion"
416 (map (\n -> inclusion (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
417 & phylo_groupMeta %~ insert "frequence"
418 (map (\n -> getInMap n (phylo ^. phylo_lastTermFreq)) $ g ^. phylo_groupNgrams)
419 ) export
420
421
422 branchDating :: PhyloExport -> PhyloExport
423 branchDating export =
424 over ( export_branches
425 . traverse )
426 (\b ->
427 let groups = sortOn fst
428 $ foldl' (\acc g -> if (g ^. phylo_groupBranchId == b ^. branch_id)
429 then acc ++ [g ^. phylo_groupPeriod]
430 else acc ) [] $ export ^. export_groups
431 periods = nub groups
432 birth = fst $ head' "birth" groups
433 age = (snd $ last' "age" groups) - birth
434 in b & branch_meta %~ insert "birth" [fromIntegral birth]
435 & branch_meta %~ insert "age" [fromIntegral age]
436 & branch_meta %~ insert "size" [fromIntegral $ length periods] ) export
437
438 processMetrics :: Phylo -> PhyloExport -> PhyloExport
439 processMetrics phylo export = ngramsMetrics phylo
440 $ branchDating export
441
442
443 -----------------
444 -- | Taggers | --
445 -----------------
446
447 nk :: Int -> [[Int]] -> Int
448 nk n groups = sum
449 $ map (\g -> if (elem n g)
450 then 1
451 else 0) groups
452
453
454 tf :: Int -> [[Int]] -> Double
455 tf n groups = (fromIntegral $ nk n groups) / (fromIntegral $ length $ concat groups)
456
457
458 idf :: Int -> [[Int]] -> Double
459 idf n groups = log ((fromIntegral $ length groups) / (fromIntegral $ nk n groups))
460
461
462 findTfIdf :: [[Int]] -> [(Int,Double)]
463 findTfIdf groups = reverse $ sortOn snd $ map (\n -> (n,(tf n groups) * (idf n groups))) $ sort $ nub $ concat groups
464
465
466 findEmergences :: [PhyloGroup] -> Map Int Double -> [(Int,Double)]
467 findEmergences groups freq =
468 let ngrams = map _phylo_groupNgrams groups
469 dynamics = map (\g -> (g ^. phylo_groupMeta) ! "dynamics") groups
470 emerging = nubBy (\n1 n2 -> fst n1 == fst n2)
471 $ concat $ map (\g -> filter (\(_,d) -> d == 0) $ zip (fst g) (snd g)) $ zip ngrams dynamics
472 in reverse $ sortOn snd
473 $ map (\(n,_) -> if (member n freq)
474 then (n,freq ! n)
475 else (n,0)) emerging
476
477
478 mostEmergentTfIdf :: Int -> Map Int Double -> Vector Ngrams -> PhyloExport -> PhyloExport
479 mostEmergentTfIdf nth freq foundations export =
480 over ( export_branches
481 . traverse )
482 (\b ->
483 let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
484 tfidf = findTfIdf (map _phylo_groupNgrams groups)
485 emergences = findEmergences groups freq
486 selected = if (null emergences)
487 then map fst $ take nth tfidf
488 else [fst $ head' "mostEmergentTfIdf" emergences]
489 ++ (map fst $ take (nth - 1) $ filter (\(n,_) -> n /= (fst $ head' "mostEmergentTfIdf" emergences)) tfidf)
490 in b & branch_label .~ (ngramsToLabel foundations selected)) export
491
492
493 getNthMostMeta :: Int -> [Double] -> [Int] -> [Int]
494 getNthMostMeta nth meta ns = map (\(idx,_) -> (ns !! idx))
495 $ take nth
496 $ reverse
497 $ sortOn snd $ zip [0..] meta
498
499
500 mostInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
501 mostInclusive nth foundations export =
502 over ( export_branches
503 . traverse )
504 (\b ->
505 let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
506 cooc = foldl (\acc g -> unionWith (+) acc (g ^. phylo_groupCooc)) empty groups
507 ngrams = sort $ foldl (\acc g -> union acc (g ^. phylo_groupNgrams)) [] groups
508 inc = map (\n -> inclusion cooc (ngrams \\ [n]) n) ngrams
509 lbl = ngramsToLabel foundations $ getNthMostMeta nth inc ngrams
510 in b & branch_label .~ lbl ) export
511
512
513 mostEmergentInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
514 mostEmergentInclusive nth foundations export =
515 over ( export_groups
516 . traverse )
517 (\g ->
518 let lbl = ngramsToLabel foundations
519 $ take nth
520 $ map (\(_,(_,idx)) -> idx)
521 $ concat
522 $ map (\groups -> sortOn (fst . snd) groups)
523 $ groupBy ((==) `on` fst) $ reverse $ sortOn fst
524 $ zip ((g ^. phylo_groupMeta) ! "inclusion")
525 $ zip ((g ^. phylo_groupMeta) ! "dynamics") (g ^. phylo_groupNgrams)
526 in g & phylo_groupLabel .~ lbl ) export
527
528
529 processLabels :: [PhyloLabel] -> Vector Ngrams -> Map Int Double -> PhyloExport -> PhyloExport
530 processLabels labels foundations freq export =
531 foldl (\export' label ->
532 case label of
533 GroupLabel tagger nth ->
534 case tagger of
535 MostEmergentInclusive -> mostEmergentInclusive nth foundations export'
536 _ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger"
537 BranchLabel tagger nth ->
538 case tagger of
539 MostInclusive -> mostInclusive nth foundations export'
540 MostEmergentTfIdf -> mostEmergentTfIdf nth freq foundations export'
541 _ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger" ) export labels
542
543
544 ------------------
545 -- | Dynamics | --
546 ------------------
547
548
549 toDynamics :: Int -> [PhyloGroup] -> PhyloGroup -> Map Int (Date,Date) -> Double
550 toDynamics n parents g m =
551 let prd = g ^. phylo_groupPeriod
552 end = last' "dynamics" (sort $ map snd $ elems m)
553 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
554 {- decrease -}
555 then 2
556 else if ((fst prd) == (fst $ m ! n))
557 {- emerging -}
558 then 0
559 else if isNew
560 {- emergence -}
561 then 1
562 else 3
563 where
564 --------------------------------------
565 isNew :: Bool
566 isNew = not $ elem n $ concat $ map _phylo_groupNgrams parents
567
568
569 processDynamics :: [PhyloGroup] -> [PhyloGroup]
570 processDynamics groups =
571 map (\g ->
572 let parents = filter (\g' -> (g ^. phylo_groupBranchId == g' ^. phylo_groupBranchId)
573 && ((fst $ g ^. phylo_groupPeriod) > (fst $ g' ^. phylo_groupPeriod))) groups
574 in g & phylo_groupMeta %~ insert "dynamics" (map (\n -> toDynamics n parents g mapNgrams) $ g ^. phylo_groupNgrams) ) groups
575 where
576 --------------------------------------
577 mapNgrams :: Map Int (Date,Date)
578 mapNgrams = map (\dates ->
579 let dates' = sort dates
580 in (head' "dynamics" dates', last' "dynamics" dates'))
581 $ fromListWith (++)
582 $ foldl (\acc g -> acc ++ ( map (\n -> (n,[fst $ g ^. phylo_groupPeriod, snd $ g ^. phylo_groupPeriod]))
583 $ (g ^. phylo_groupNgrams))) [] groups
584
585
586 -----------------
587 -- | horizon | --
588 -----------------
589
590 getGroupThr :: Double -> PhyloGroup -> Double
591 getGroupThr step g =
592 let seaLvl = (g ^. phylo_groupMeta) ! "seaLevels"
593 breaks = (g ^. phylo_groupMeta) ! "breaks"
594 in (last' "export" (take (round $ (last' "export" breaks) + 1) seaLvl)) - step
595
596 toAncestor :: Double -> Map Int Double -> Proximity -> Double -> [PhyloGroup] -> PhyloGroup -> PhyloGroup
597 toAncestor nbDocs diago proximity step candidates ego =
598 let curr = ego ^. phylo_groupAncestors
599 in ego & phylo_groupAncestors .~ (curr ++ (map (\(g,w) -> (getGroupId g,w))
600 $ filter (\(g,w) -> (w > 0) && (w >= (min (getGroupThr step ego) (getGroupThr step g))))
601 $ map (\g -> (g, toProximity nbDocs diago proximity (ego ^. phylo_groupNgrams) (g ^. phylo_groupNgrams) (g ^. phylo_groupNgrams)))
602 $ filter (\g -> g ^. phylo_groupBranchId /= ego ^. phylo_groupBranchId ) candidates))
603
604
605 headsToAncestors :: Double -> Map Int Double -> Proximity -> Double -> [PhyloGroup] -> [PhyloGroup] -> [PhyloGroup]
606 headsToAncestors nbDocs diago proximity step heads acc =
607 if (null heads)
608 then acc
609 else
610 let ego = head' "headsToAncestors" heads
611 heads' = tail' "headsToAncestors" heads
612 in headsToAncestors nbDocs diago proximity step heads' (acc ++ [toAncestor nbDocs diago proximity step heads' ego])
613
614
615 toHorizon :: Phylo -> Phylo
616 toHorizon phylo =
617 let phyloAncestor = updatePhyloGroups
618 level
619 (fromList $ map (\g -> (getGroupId g, g))
620 $ concat
621 $ tracePhyloAncestors newGroups) phylo
622 reBranched = fromList $ map (\g -> (getGroupId g, g)) $ concat
623 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g)) $ getGroupsFromLevel level phyloAncestor
624 in updatePhyloGroups level reBranched phylo
625 where
626 -- | 1) for each periods
627 periods :: [PhyloPeriodId]
628 periods = getPeriodIds phylo
629 -- --
630 level :: Level
631 level = getLastLevel phylo
632 -- --
633 frame :: Int
634 frame = getTimeFrame $ timeUnit $ getConfig phylo
635 -- | 2) find ancestors between groups without parents
636 mapGroups :: [[PhyloGroup]]
637 mapGroups = map (\prd ->
638 let groups = getGroupsFromLevelPeriods level [prd] phylo
639 childs = getPreviousChildIds level frame prd periods phylo
640 -- maybe add a better filter for non isolated ancestors
641 heads = filter (\g -> (not . null) $ (g ^. phylo_groupPeriodChilds))
642 $ filter (\g -> null (g ^. phylo_groupPeriodParents) && (notElem (getGroupId g) childs)) groups
643 noHeads = groups \\ heads
644 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) [prd]
645 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) [prd]
646 proximity = (phyloProximity $ getConfig phylo)
647 step = case getSeaElevation phylo of
648 Constante _ s -> s
649 Adaptative _ -> undefined
650 -- in headsToAncestors nbDocs diago proximity heads groups []
651 in map (\ego -> toAncestor nbDocs diago proximity step noHeads ego)
652 $ headsToAncestors nbDocs diago proximity step heads []
653 ) periods
654 -- | 3) process this task concurrently
655 newGroups :: [[PhyloGroup]]
656 newGroups = mapGroups `using` parList rdeepseq
657 --------------------------------------
658
659 getPreviousChildIds :: Level -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> Phylo -> [PhyloGroupId]
660 getPreviousChildIds lvl frame curr prds phylo =
661 concat $ map ((map fst) . _phylo_groupPeriodChilds)
662 $ getGroupsFromLevelPeriods lvl (getNextPeriods ToParents frame curr prds) phylo
663
664 ---------------------
665 -- | phyloExport | --
666 ---------------------
667
668 toPhyloExport :: Phylo -> DotGraph DotId
669 toPhyloExport phylo = exportToDot phylo
670 $ processFilters (exportFilter $ getConfig phylo) (phyloQuality $ getConfig phylo)
671 $ processSort (exportSort $ getConfig phylo) (getSeaElevation phylo)
672 $ processLabels (exportLabel $ getConfig phylo) (getRoots phylo) (_phylo_lastTermFreq phylo)
673 $ processMetrics phylo export
674 where
675 export :: PhyloExport
676 export = PhyloExport groups branches
677 --------------------------------------
678 branches :: [PhyloBranch]
679 branches = map (\g ->
680 let seaLvl = (g ^. phylo_groupMeta) ! "seaLevels"
681 breaks = (g ^. phylo_groupMeta) ! "breaks"
682 canonId = take (round $ (last' "export" breaks) + 2) (snd $ g ^. phylo_groupBranchId)
683 in PhyloBranch (g ^. phylo_groupBranchId)
684 canonId
685 seaLvl
686 0
687 (last' "export" (take (round $ (last' "export" breaks) + 1) seaLvl))
688 0
689 0
690 "" empty)
691 $ map (\gs -> head' "export" gs)
692 $ groupBy (\g g' -> g ^. phylo_groupBranchId == g' ^. phylo_groupBranchId)
693 $ sortOn (\g -> g ^. phylo_groupBranchId) groups
694 --------------------------------------
695 groups :: [PhyloGroup]
696 groups = traceExportGroups
697 $ processDynamics
698 $ getGroupsFromLevel (phyloLevel $ getConfig phylo)
699 $ tracePhyloInfo phylo
700 -- \$ toHorizon phylo
701
702
703 traceExportBranches :: [PhyloBranch] -> [PhyloBranch]
704 traceExportBranches branches = trace ("\n"
705 <> "-- | Export " <> show(length branches) <> " branches") branches
706
707 tracePhyloAncestors :: [[PhyloGroup]] -> [[PhyloGroup]]
708 tracePhyloAncestors groups = trace ( "-- | Found " <> show(length $ concat $ map _phylo_groupAncestors $ concat groups) <> " ancestors") groups
709
710 tracePhyloInfo :: Phylo -> Phylo
711 tracePhyloInfo phylo = trace ("\n" <> "##########################" <> "\n\n" <> "-- | Phylo with λ = "
712 <> show(_qua_granularity $ phyloQuality $ getConfig phylo) <> " applied to "
713 <> show(length $ Vector.toList $ getRoots phylo) <> " foundations"
714 ) phylo
715
716
717 traceExportGroups :: [PhyloGroup] -> [PhyloGroup]
718 traceExportGroups groups = trace ("\n" <> "-- | Export "
719 <> show(length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups) <> " branches, "
720 <> show(length groups) <> " groups and "
721 <> show(length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) groups) <> " terms"
722 ) groups
723