]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloExport.hs
[ngrams] try terms query without the materialized view
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloExport.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloExport
3 Description : Exportation module of a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE TypeSynonymInstances #-}
12
13 module Gargantext.Core.Viz.Phylo.PhyloExport where
14
15 import Control.Lens hiding (Level)
16 import Control.Parallel.Strategies (parList, rdeepseq, using)
17 import Data.GraphViz hiding (DotGraph, Order)
18 import Data.GraphViz.Attributes.Complete hiding (EdgeType, Order)
19 import Data.GraphViz.Types.Generalised (DotGraph)
20 import Data.GraphViz.Types.Monadic
21 import Data.List ((++), sort, nub, null, concat, sortOn, groupBy, union, (\\), (!!), init, partition, notElem, unwords, nubBy, inits, elemIndex)
22 import Data.Map (Map, fromList, empty, fromListWith, insert, (!), elems, unionWith, findWithDefault, toList, member)
23 import Data.Text.Lazy (fromStrict, pack, unpack)
24 import Data.Vector (Vector)
25 import Debug.Trace (trace)
26 import Gargantext.Core.Viz.Phylo
27 import Gargantext.Core.Viz.Phylo.PhyloTools
28 import Gargantext.Core.Viz.Phylo.TemporalMatching (filterDocs, filterDiago, reduceDiagos, toSimilarity, getNextPeriods)
29 import Gargantext.Prelude hiding (scale)
30 import Prelude (writeFile)
31 import System.FilePath
32 import qualified Data.GraphViz.Attributes.HTML as H
33 import qualified Data.Text as Text
34 import qualified Data.Text.Lazy as Lazy
35 import qualified Data.Vector as Vector
36
37 --------------------
38 -- | Dot export | --
39 --------------------
40
41 dotToFile :: FilePath -> DotGraph DotId -> IO ()
42 dotToFile filePath dotG = writeFile filePath $ dotToString dotG
43
44 dotToString :: DotGraph DotId -> [Char]
45 dotToString dotG = unpack (printDotGraph dotG)
46
47 dynamicToColor :: Double -> H.Attribute
48 dynamicToColor d
49 | d == 0 = H.BGColor (toColor LightCoral)
50 | d == 1 = H.BGColor (toColor Khaki)
51 | d == 2 = H.BGColor (toColor SkyBlue)
52 | otherwise = H.Color (toColor Black)
53
54 pickLabelColor :: [Double] -> H.Attribute
55 pickLabelColor lst
56 | elem 0 lst = dynamicToColor 0
57 | elem 2 lst = dynamicToColor 2
58 | elem 1 lst = dynamicToColor 1
59 | otherwise = dynamicToColor 3
60
61 toDotLabel :: Text.Text -> Label
62 toDotLabel lbl = StrLabel $ fromStrict lbl
63
64 toAttr :: AttributeName -> Lazy.Text -> CustomAttribute
65 toAttr k v = customAttribute k v
66
67 metaToAttr :: Map Text.Text [Double] -> [CustomAttribute]
68 metaToAttr meta = map (\(k,v) -> toAttr (fromStrict k) $ (pack . unwords) $ map show v) $ toList meta
69
70 groupIdToDotId :: PhyloGroupId -> DotId
71 groupIdToDotId (((d,d'),lvl),idx) = (fromStrict . Text.pack) $ ("group" <> (show d) <> (show d') <> (show lvl) <> (show idx))
72
73 branchIdToDotId :: PhyloBranchId -> DotId
74 branchIdToDotId bId = (fromStrict . Text.pack) $ ("branch" <> show (snd bId))
75
76 periodIdToDotId :: Period -> DotId
77 periodIdToDotId prd = (fromStrict . Text.pack) $ ("period" <> show (fst prd) <> show (snd prd))
78
79 groupToTable :: Vector Ngrams -> PhyloGroup -> H.Label
80 groupToTable fdt g = H.Table H.HTable
81 { H.tableFontAttrs = Just [H.PointSize 14, H.Align H.HLeft]
82 , H.tableAttrs = [H.Border 0, H.CellBorder 0, H.BGColor (toColor White)]
83 , H.tableRows = [header]
84 <> [H.Cells [H.LabelCell [H.Height 10] $ H.Text [H.Str $ fromStrict ""]]]
85 <> ( map ngramsToRow $ splitEvery 4
86 $ reverse $ sortOn (snd . snd)
87 $ zip (ngramsToText fdt (g ^. phylo_groupNgrams))
88 $ zip ((g ^. phylo_groupMeta) ! "dynamics") ((g ^. phylo_groupMeta) ! "inclusion"))}
89 where
90 --------------------------------------
91 ngramsToRow :: [(Ngrams,(Double,Double))] -> H.Row
92 ngramsToRow ns = H.Cells $ map (\(n,(d,_)) ->
93 H.LabelCell [H.Align H.HLeft,dynamicToColor d] $ H.Text [H.Str $ fromStrict n]) ns
94 --------------------------------------
95 header :: H.Row
96 header =
97 H.Cells [ H.LabelCell [pickLabelColor ((g ^. phylo_groupMeta) ! "dynamics")]
98 $ H.Text [H.Str $ (((fromStrict . Text.toUpper) $ g ^. phylo_groupLabel)
99 <> (fromStrict " ( ")
100 <> (pack $ show (fst $ g ^. phylo_groupPeriod))
101 <> (fromStrict " , ")
102 <> (pack $ show (snd $ g ^. phylo_groupPeriod))
103 <> (fromStrict " ) ")
104 <> (pack $ show (getGroupId g)))]]
105 --------------------------------------
106
107 branchToDotNode :: PhyloBranch -> Int -> Dot DotId
108 branchToDotNode b bId =
109 node (branchIdToDotId $ b ^. branch_id)
110 ([FillColor [toWColor CornSilk], FontName "Arial", FontSize 40, Shape Egg, Style [SItem Bold []], Label (toDotLabel $ b ^. branch_label)]
111 <> (metaToAttr $ b ^. branch_meta)
112 <> [ toAttr "nodeType" "branch"
113 , toAttr "bId" (pack $ show bId)
114 , toAttr "branchId" (pack $ unwords (map show $ snd $ b ^. branch_id))
115 , toAttr "branch_x" (fromStrict $ Text.pack $ (show $ b ^. branch_x))
116 , toAttr "branch_y" (fromStrict $ Text.pack $ (show $ b ^. branch_y))
117 , toAttr "label" (pack $ show $ b ^. branch_label)
118 ])
119
120 periodToDotNode :: (Date,Date) -> (Text.Text,Text.Text) -> Dot DotId
121 periodToDotNode prd prd' =
122 node (periodIdToDotId prd)
123 ([Shape BoxShape, FontSize 50, Label (toDotLabel $ Text.pack (show (fst prd) <> " " <> show (snd prd)))]
124 <> [ toAttr "nodeType" "period"
125 , toAttr "strFrom" (fromStrict $ Text.pack $ (show $ fst prd'))
126 , toAttr "strTo" (fromStrict $ Text.pack $ (show $ snd prd'))
127 , toAttr "from" (fromStrict $ Text.pack $ (show $ fst prd))
128 , toAttr "to" (fromStrict $ Text.pack $ (show $ snd prd))])
129
130
131 groupToDotNode :: Vector Ngrams -> PhyloGroup -> Int -> Dot DotId
132 groupToDotNode fdt g bId =
133 node (groupIdToDotId $ getGroupId g)
134 ([FontName "Arial", Shape Square, penWidth 4, toLabel (groupToTable fdt g)]
135 <> [ toAttr "nodeType" "group"
136 , toAttr "gid" (groupIdToDotId $ getGroupId g)
137 , toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
138 , toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
139 , toAttr "strFrom" (pack $ show (fst $ g ^. phylo_groupPeriod'))
140 , toAttr "strTo" (pack $ show (snd $ g ^. phylo_groupPeriod'))
141 , toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
142 , toAttr "bId" (pack $ show bId)
143 , toAttr "support" (pack $ show (g ^. phylo_groupSupport))
144 , toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
145 , toAttr "source" (pack $ show (nub $ g ^. phylo_groupSources))
146 , toAttr "sourceFull" (pack $ show (g ^. phylo_groupSources))
147 , toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
148 , toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
149 , toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
150 , toAttr "frequence" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "frequence")))
151 , toAttr "seaLvl" (pack $ show ((g ^. phylo_groupMeta) ! "seaLevels"))
152 ])
153
154
155 toDotEdge' :: DotId -> DotId -> [Char] -> [Char] -> EdgeType -> Dot DotId
156 toDotEdge' source target thr w edgeType = edge source target
157 (case edgeType of
158 GroupToGroup -> undefined
159 GroupToGroupMemory -> [ Width 3, penWidth 4, Color [toWColor Black], Constraint True] <> [toAttr "edgeType" "memoryLink", toAttr "thr" (pack thr), toAttr "weight" (pack w)]
160 BranchToGroup -> undefined
161 BranchToBranch -> undefined
162 GroupToAncestor -> undefined
163 PeriodToPeriod -> undefined)
164
165
166 toDotEdge :: DotId -> DotId -> [Char] -> EdgeType -> Dot DotId
167 toDotEdge source target lbl edgeType = edge source target
168 (case edgeType of
169 GroupToGroup -> [ Width 3, penWidth 4, Color [toWColor Black], Constraint True] <> [toAttr "edgeType" "link", toAttr "lbl" (pack lbl), toAttr "source" source, toAttr "target" target]
170 GroupToGroupMemory -> undefined
171 BranchToGroup -> [ Width 3, Color [toWColor Black], ArrowHead (AType [(ArrMod FilledArrow RightSide,DotArrow)])] <> [toAttr "edgeType" "branchLink" ]
172 BranchToBranch -> [ Width 2, Color [toWColor Black], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,DotArrow)])]
173 GroupToAncestor -> [ Width 3, Color [toWColor Red], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,NoArrow)]), PenWidth 4] <> [toAttr "edgeType" "ancestorLink", toAttr "lbl" (pack lbl), toAttr "source" source, toAttr "target" target]
174 PeriodToPeriod -> [ Width 5, Color [toWColor Black]])
175
176
177 mergePointers :: [PhyloGroup] -> Map (PhyloGroupId,PhyloGroupId) Double
178 mergePointers groups =
179 let toChilds = fromList $ concat $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupPeriodChilds) groups
180 toParents = fromList $ concat $ map (\g -> map (\(target,w) -> ((target,getGroupId g),w)) $ g ^. phylo_groupPeriodParents) groups
181 in unionWith (\w w' -> max w w') toChilds toParents
182
183 mergePointersMemory :: [PhyloGroup] -> [((PhyloGroupId,PhyloGroupId),(Double,Double))]
184 mergePointersMemory groups =
185 let toChilds = concat $ map (\g -> map (\(target,(t,w)) -> ((getGroupId g,target),(t,w))) $ g ^. phylo_groupPeriodMemoryChilds) groups
186 toParents = concat $ map (\g -> map (\(target,(t,w)) -> ((target,getGroupId g),(t,w))) $ g ^. phylo_groupPeriodMemoryParents) groups
187 in concat [toChilds,toParents]
188
189 mergeAncestors :: [PhyloGroup] -> [((PhyloGroupId,PhyloGroupId), Double)]
190 mergeAncestors groups = concat
191 $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupAncestors)
192 $ filter (\g -> (not . null) $ g ^. phylo_groupAncestors) groups
193
194
195 toBid :: PhyloGroup -> [PhyloBranch] -> Int
196 toBid g bs =
197 let b' = head' "toBid" (filter (\b -> b ^. branch_id == g ^. phylo_groupBranchId) bs)
198 in fromJust $ elemIndex b' bs
199
200 exportToDot :: Phylo -> PhyloExport -> DotGraph DotId
201 exportToDot phylo export =
202 trace ("\n-- | Convert " <> show(length $ export ^. export_branches) <> " branches and "
203 <> show(length $ export ^. export_groups) <> " groups "
204 <> show(length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) $ export ^. export_groups) <> " terms to a dot file\n\n"
205 <> "##########################") $
206 digraph ((Str . fromStrict) $ (phyloName $ getConfig phylo)) $ do
207
208 {- 1) init the dot graph -}
209 graphAttrs ( [ Label (toDotLabel $ (phyloName $ getConfig phylo))]
210 <> [ FontSize 30, LabelLoc VTop, NodeSep 1, RankSep [1], Rank SameRank, Splines SplineEdges, Overlap ScaleOverlaps
211 , Ratio FillRatio
212 -- , Ratio AutoRatio
213 , Style [SItem Filled []],Color [toWColor White]]
214 {-- home made attributes -}
215 <> [(toAttr (fromStrict "phyloFoundations") $ pack $ show (length $ Vector.toList $ getRoots phylo))
216 ,(toAttr (fromStrict "phyloTerms") $ pack $ show (length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) $ export ^. export_groups))
217 ,(toAttr (fromStrict "phyloDocs") $ pack $ show (sum $ elems $ getDocsByDate phylo))
218 ,(toAttr (fromStrict "phyloPeriods") $ pack $ show (length $ elems $ phylo ^. phylo_periods))
219 ,(toAttr (fromStrict "phyloBranches") $ pack $ show (length $ export ^. export_branches))
220 ,(toAttr (fromStrict "phyloGroups") $ pack $ show (length $ export ^. export_groups))
221 ,(toAttr (fromStrict "phyloSources") $ pack $ show (Vector.toList $ getSources phylo))
222 ,(toAttr (fromStrict "phyloTimeScale") $ pack $ getTimeScale phylo)
223 ,(toAttr (fromStrict "PhyloScale") $ pack $ show (getLevel phylo))
224 ,(toAttr (fromStrict "phyloQuality") $ pack $ show (phylo ^. phylo_quality))
225 ,(toAttr (fromStrict "phyloSeaRiseStart") $ pack $ show (getPhyloSeaRiseStart phylo))
226 ,(toAttr (fromStrict "phyloSeaRiseSteps") $ pack $ show (getPhyloSeaRiseSteps phylo))
227 -- ,(toAttr (fromStrict "phyloTermsFreq") $ pack $ show (toList $ _phylo_lastTermFreq phylo))
228 ])
229
230 {-
231 -- toAttr (fromStrict k) $ (pack . unwords) $ map show v
232
233 -- 2) create a layer for the branches labels -}
234 subgraph (Str "Branches peaks") $ do
235
236 -- graphAttrs [Rank SameRank]
237 {-
238 -- 3) group the branches by hierarchy
239 -- mapM (\branches ->
240 -- subgraph (Str "Branches clade") $ do
241 -- graphAttrs [Rank SameRank]
242
243 -- -- 4) create a node for each branch
244 -- mapM branchToDotNode branches
245 -- ) $ elems $ fromListWith (++) $ map (\b -> ((init . snd) $ b ^. branch_id,[b])) $ export ^. export_branches
246 -}
247 mapM (\b -> branchToDotNode b (fromJust $ elemIndex b (export ^. export_branches))) $ export ^. export_branches
248
249 {-- 5) create a layer for each period -}
250 _ <- mapM (\period ->
251 subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst $ _phylo_periodPeriod period) <> show (snd $ _phylo_periodPeriod period))) $ do
252 graphAttrs [Rank SameRank]
253 periodToDotNode (period ^. phylo_periodPeriod) (period ^. phylo_periodPeriodStr)
254
255 {-- 6) create a node for each group -}
256 mapM (\g -> groupToDotNode (getRoots phylo) g (toBid g (export ^. export_branches))) (filter (\g -> g ^. phylo_groupPeriod == (period ^. phylo_periodPeriod)) $ export ^. export_groups)
257 ) $ phylo ^. phylo_periods
258
259 {-- 7) create the edges between a branch and its first groups -}
260 _ <- mapM (\(bId,groups) ->
261 mapM (\g -> toDotEdge (branchIdToDotId bId) (groupIdToDotId $ getGroupId g) "" BranchToGroup) groups
262 )
263 $ toList
264 $ map (\groups -> head' "toDot"
265 $ groupBy (\g g' -> g' ^. phylo_groupPeriod == g ^. phylo_groupPeriod)
266 $ sortOn (fst . _phylo_groupPeriod) groups)
267 $ fromListWith (++) $ map (\g -> (g ^. phylo_groupBranchId,[g])) $ export ^. export_groups
268
269 {- 8) create the edges between the groups -}
270 _ <- mapM (\((k,k'),v) ->
271 toDotEdge (groupIdToDotId k) (groupIdToDotId k') (show v) GroupToGroup
272 ) $ (toList . mergePointers) $ export ^. export_groups
273
274 {- 8-bis) create the edges between the groups -}
275 {- _ <- mapM (\((k,k'),v) ->
276 toDotEdge' (groupIdToDotId k) (groupIdToDotId k') (show (fst v)) (show (snd v)) GroupToGroupMemory
277 ) $ mergePointersMemory $ export ^. export_groups -}
278
279 _ <- mapM (\((k,k'),v) ->
280 toDotEdge (groupIdToDotId k) (groupIdToDotId k') (show v) GroupToAncestor
281 ) $ mergeAncestors $ export ^. export_groups
282
283 -- 10) create the edges between the periods
284 _ <- mapM (\(prd,prd') ->
285 toDotEdge (periodIdToDotId prd) (periodIdToDotId prd') "" PeriodToPeriod
286 ) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ getPeriodIds phylo
287
288 {- 8) create the edges between the branches
289 -- _ <- mapM (\(bId,bId') ->
290 -- toDotEdge (branchIdToDotId bId) (branchIdToDotId bId')
291 -- (Text.pack $ show(branchIdsToSimilarity bId bId'
292 -- (getThresholdInit $ phyloSimilarity $ getConfig phylo)
293 -- (getThresholdStep $ phyloSimilarity $ getConfig phylo))) BranchToBranch
294 -- ) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ map _branch_id $ export ^. export_branches
295 -}
296
297
298 graphAttrs [Rank SameRank]
299
300
301 ----------------
302 -- | Filter | --
303 ----------------
304
305 filterByBranchSize :: Double -> PhyloExport -> PhyloExport
306 filterByBranchSize thr export =
307 let splited = partition (\b -> head' "filter" ((b ^. branch_meta) ! "size") >= thr) $ export ^. export_branches
308 in export & export_branches .~ (fst splited)
309 & export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd splited)))
310
311
312 processFilters :: [Filter] -> Quality -> PhyloExport -> PhyloExport
313 processFilters filters qua export =
314 foldl (\export' f -> case f of
315 ByBranchSize thr -> if (thr < (fromIntegral $ qua ^. qua_minBranch))
316 then filterByBranchSize (fromIntegral $ qua ^. qua_minBranch) export'
317 else filterByBranchSize thr export'
318 ) export filters
319
320 --------------
321 -- | Sort | --
322 --------------
323
324 branchToIso :: [PhyloBranch] -> [PhyloBranch]
325 branchToIso branches =
326 let steps = map sum
327 $ inits
328 $ map (\(b,x) -> b ^. branch_y + 0.05 - x)
329 $ zip branches
330 $ ([0] ++ (map (\(b,b') ->
331 let idx = length $ commonPrefix (b ^. branch_canonId) (b' ^. branch_canonId) []
332 lmin = min (length $ b ^. branch_seaLevel) (length $ b' ^. branch_seaLevel)
333 in
334 if ((idx - 1) > ((length $ b' ^. branch_seaLevel) - 1))
335 then (b' ^. branch_seaLevel) !! (lmin - 1)
336 else (b' ^. branch_seaLevel) !! (idx - 1)
337 ) $ listToSeq branches))
338 in map (\(x,b) -> b & branch_x .~ x)
339 $ zip steps branches
340
341 branchToIso' :: Double -> Double -> [PhyloBranch] -> [PhyloBranch]
342 branchToIso' start step branches =
343 let bx = map (\l -> (sum l) + ((fromIntegral $ length l) * 0.5))
344 $ inits
345 $ ([0] ++ (map (\(b,b') ->
346 let root = fromIntegral $ length $ commonPrefix (snd $ b ^. branch_id) (snd $ b' ^. branch_id) []
347 in 1 - start - step * root) $ listToSeq branches))
348 in map (\(x,b) -> b & branch_x .~ x)
349 $ zip bx branches
350
351
352 sortByHierarchy :: Int -> [PhyloBranch] -> [PhyloBranch]
353 sortByHierarchy depth branches =
354 if (length branches == 1)
355 then branches
356 else concat
357 $ map (\branches' ->
358 let partitions = partition (\b -> depth + 1 == ((length . snd) $ b ^. branch_id)) branches'
359 in (sortOn (\b -> (b ^. branch_meta) ! "birth") (fst partitions))
360 ++ (sortByHierarchy (depth + 1) (snd partitions)))
361 $ groupBy (\b b' -> ((take depth . snd) $ b ^. branch_id) == ((take depth . snd) $ b' ^. branch_id) )
362 $ sortOn (\b -> (take depth . snd) $ b ^. branch_id) branches
363
364
365 sortByBirthDate :: Order -> PhyloExport -> PhyloExport
366 sortByBirthDate order export =
367 let branches = sortOn (\b -> (b ^. branch_meta) ! "birth") $ export ^. export_branches
368 branches' = case order of
369 Asc -> branches
370 Desc -> reverse branches
371 in export & export_branches .~ branches'
372
373 processSort :: Sort -> SeaElevation -> PhyloExport -> PhyloExport
374 processSort sort' elev export = case sort' of
375 ByBirthDate o -> sortByBirthDate o export
376 ByHierarchy _ -> case elev of
377 Constante s s' -> export & export_branches .~ (branchToIso' s s' $ sortByHierarchy 0 (export ^. export_branches))
378 Adaptative _ -> export & export_branches .~ (branchToIso $ sortByHierarchy 0 (export ^. export_branches))
379 Evolving _ -> export & export_branches .~ (branchToIso $ sortByHierarchy 0 (export ^. export_branches))
380
381 -----------------
382 -- | Metrics | --
383 -----------------
384
385 -- | Return the conditional probability of i knowing j
386 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
387 conditional m i j = (findWithDefault 0 (i,j) m)
388 / (m ! (j,j))
389
390
391 -- | Return the genericity score of a given ngram
392 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
393 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
394 - (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
395
396
397 -- | Return the specificity score of a given ngram
398 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
399 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
400 - (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
401
402
403 -- | Return the inclusion score of a given ngram
404 inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
405 inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
406 + (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
407
408
409 ngramsMetrics :: Phylo -> PhyloExport -> PhyloExport
410 ngramsMetrics phylo export =
411 over ( export_groups
412 . traverse )
413 (\g -> g & phylo_groupMeta %~ insert "genericity"
414 (map (\n -> genericity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
415 & phylo_groupMeta %~ insert "specificity"
416 (map (\n -> specificity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
417 & phylo_groupMeta %~ insert "inclusion"
418 (map (\n -> inclusion (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
419 & phylo_groupMeta %~ insert "frequence"
420 (map (\n -> getInMap n (getLastRootsFreq phylo)) $ g ^. phylo_groupNgrams)
421 ) export
422
423
424 branchDating :: PhyloExport -> PhyloExport
425 branchDating export =
426 over ( export_branches
427 . traverse )
428 (\b ->
429 let groups = sortOn fst
430 $ foldl' (\acc g -> if (g ^. phylo_groupBranchId == b ^. branch_id)
431 then acc ++ [g ^. phylo_groupPeriod]
432 else acc ) [] $ export ^. export_groups
433 periods = nub groups
434 birth = fst $ head' "birth" groups
435 age = (snd $ last' "age" groups) - birth
436 in b & branch_meta %~ insert "birth" [fromIntegral birth]
437 & branch_meta %~ insert "age" [fromIntegral age]
438 & branch_meta %~ insert "size" [fromIntegral $ length periods] ) export
439
440 processMetrics :: Phylo -> PhyloExport -> PhyloExport
441 processMetrics phylo export = ngramsMetrics phylo
442 $ branchDating export
443
444
445 -----------------
446 -- | Taggers | --
447 -----------------
448
449 nk :: Int -> [[Int]] -> Int
450 nk n groups = sum
451 $ map (\g -> if (elem n g)
452 then 1
453 else 0) groups
454
455
456 tf :: Int -> [[Int]] -> Double
457 tf n groups = (fromIntegral $ nk n groups) / (fromIntegral $ length $ concat groups)
458
459
460 idf :: Int -> [[Int]] -> Double
461 idf n groups = log ((fromIntegral $ length groups) / (fromIntegral $ nk n groups))
462
463
464 findTfIdf :: [[Int]] -> [(Int,Double)]
465 findTfIdf groups = reverse $ sortOn snd $ map (\n -> (n,(tf n groups) * (idf n groups))) $ sort $ nub $ concat groups
466
467
468 findEmergences :: [PhyloGroup] -> Map Int Double -> [(Int,Double)]
469 findEmergences groups freq =
470 let ngrams = map _phylo_groupNgrams groups
471 dynamics = map (\g -> (g ^. phylo_groupMeta) ! "dynamics") groups
472 emerging = nubBy (\n1 n2 -> fst n1 == fst n2)
473 $ concat $ map (\g -> filter (\(_,d) -> d == 0) $ zip (fst g) (snd g)) $ zip ngrams dynamics
474 in reverse $ sortOn snd
475 $ map (\(n,_) -> if (member n freq)
476 then (n,freq ! n)
477 else (n,0)) emerging
478
479
480 mostEmergentTfIdf :: Int -> Map Int Double -> Vector Ngrams -> PhyloExport -> PhyloExport
481 mostEmergentTfIdf nth freq foundations export =
482 over ( export_branches
483 . traverse )
484 (\b ->
485 let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
486 tfidf = findTfIdf (map _phylo_groupNgrams groups)
487 emergences = findEmergences groups freq
488 selected = if (null emergences)
489 then map fst $ take nth tfidf
490 else [fst $ head' "mostEmergentTfIdf" emergences]
491 ++ (map fst $ take (nth - 1) $ filter (\(n,_) -> n /= (fst $ head' "mostEmergentTfIdf" emergences)) tfidf)
492 in b & branch_label .~ (ngramsToLabel foundations selected)) export
493
494
495 getNthMostMeta :: Int -> [Double] -> [Int] -> [Int]
496 getNthMostMeta nth meta ns = map (\(idx,_) -> (ns !! idx))
497 $ take nth
498 $ reverse
499 $ sortOn snd $ zip [0..] meta
500
501
502 mostInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
503 mostInclusive nth foundations export =
504 over ( export_branches
505 . traverse )
506 (\b ->
507 let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
508 cooc = foldl (\acc g -> unionWith (+) acc (g ^. phylo_groupCooc)) empty groups
509 ngrams = sort $ foldl (\acc g -> union acc (g ^. phylo_groupNgrams)) [] groups
510 inc = map (\n -> inclusion cooc (ngrams \\ [n]) n) ngrams
511 lbl = ngramsToLabel foundations $ getNthMostMeta nth inc ngrams
512 in b & branch_label .~ lbl ) export
513
514
515 mostEmergentInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
516 mostEmergentInclusive nth foundations export =
517 over ( export_groups
518 . traverse )
519 (\g ->
520 let lbl = ngramsToLabel foundations
521 $ take nth
522 $ map (\(_,(_,idx)) -> idx)
523 $ concat
524 $ map (\groups -> sortOn (fst . snd) groups)
525 $ groupBy ((==) `on` fst) $ reverse $ sortOn fst
526 $ zip ((g ^. phylo_groupMeta) ! "inclusion")
527 $ zip ((g ^. phylo_groupMeta) ! "dynamics") (g ^. phylo_groupNgrams)
528 in g & phylo_groupLabel .~ lbl ) export
529
530
531 processLabels :: [PhyloLabel] -> Vector Ngrams -> Map Int Double -> PhyloExport -> PhyloExport
532 processLabels labels foundations freq export =
533 foldl (\export' label ->
534 case label of
535 GroupLabel tagger nth ->
536 case tagger of
537 MostEmergentInclusive -> mostEmergentInclusive nth foundations export'
538 _ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger"
539 BranchLabel tagger nth ->
540 case tagger of
541 MostInclusive -> mostInclusive nth foundations export'
542 MostEmergentTfIdf -> mostEmergentTfIdf nth freq foundations export'
543 _ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger" ) export labels
544
545
546 ------------------
547 -- | Dynamics | --
548 ------------------
549
550 -- utiliser & creer une Map FdtId [PhyloGroup]
551 -- n = index of the current term
552 toDynamics :: FdtId -> [PhyloGroup] -> PhyloGroup -> Map FdtId (Date,Date) -> Double
553 toDynamics n elders g m =
554 let prd = g ^. phylo_groupPeriod
555 end = last' "dynamics" (sort $ map snd $ elems m)
556 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
557 {- decrease -}
558 then 2
559 else if ((fst prd) == (fst $ m ! n))
560 {- emerging -}
561 then 0
562 else if isNew
563 {- emergence -}
564 then 1
565 else 3
566 where
567 --------------------------------------
568 isNew :: Bool
569 isNew = not $ elem n $ concat $ map _phylo_groupNgrams elders
570
571 type FdtId = Int
572 processDynamics :: [PhyloGroup] -> [PhyloGroup]
573 processDynamics groups =
574 map (\g ->
575 let elders = filter (\g' -> (g ^. phylo_groupBranchId == g' ^. phylo_groupBranchId)
576 && ((fst $ g ^. phylo_groupPeriod) > (fst $ g' ^. phylo_groupPeriod))) groups
577 in g & phylo_groupMeta %~ insert "dynamics" (map (\n -> toDynamics n elders g mapNgrams) $ g ^. phylo_groupNgrams) ) groups
578 where
579 --------------------------------------
580 mapNgrams :: Map FdtId (Date,Date)
581 mapNgrams = map (\dates ->
582 let dates' = sort dates
583 in (head' "dynamics" dates', last' "dynamics" dates'))
584 $ fromListWith (++)
585 $ foldl (\acc g -> acc ++ ( map (\n -> (n,[fst $ g ^. phylo_groupPeriod, snd $ g ^. phylo_groupPeriod]))
586 $ (g ^. phylo_groupNgrams))) [] groups
587
588
589 -----------------
590 -- | horizon | --
591 -----------------
592
593 getGroupThr :: Double -> PhyloGroup -> Double
594 getGroupThr step g =
595 let seaLvl = (g ^. phylo_groupMeta) ! "seaLevels"
596 breaks = (g ^. phylo_groupMeta) ! "breaks"
597 in (last' "export" (take (round $ (last' "export" breaks) + 1) seaLvl)) - step
598
599 toAncestor :: Double -> Map Int Double -> PhyloSimilarity -> Double -> [PhyloGroup] -> PhyloGroup -> PhyloGroup
600 toAncestor nbDocs diago similarity step candidates ego =
601 let curr = ego ^. phylo_groupAncestors
602 in ego & phylo_groupAncestors .~ (curr ++ (map (\(g,w) -> (getGroupId g,w))
603 $ filter (\(g,w) -> (w > 0) && (w >= (min (getGroupThr step ego) (getGroupThr step g))))
604 $ map (\g -> (g, toSimilarity nbDocs diago similarity (ego ^. phylo_groupNgrams) (g ^. phylo_groupNgrams) (g ^. phylo_groupNgrams)))
605 $ filter (\g -> g ^. phylo_groupBranchId /= ego ^. phylo_groupBranchId ) candidates))
606
607
608 headsToAncestors :: Double -> Map Int Double -> PhyloSimilarity -> Double -> [PhyloGroup] -> [PhyloGroup] -> [PhyloGroup]
609 headsToAncestors nbDocs diago similarity step heads acc =
610 if (null heads)
611 then acc
612 else
613 let ego = head' "headsToAncestors" heads
614 heads' = tail' "headsToAncestors" heads
615 in headsToAncestors nbDocs diago similarity step heads' (acc ++ [toAncestor nbDocs diago similarity step heads' ego])
616
617
618 toHorizon :: Phylo -> Phylo
619 toHorizon phylo =
620 let phyloAncestor = updatePhyloGroups
621 scale
622 (fromList $ map (\g -> (getGroupId g, g))
623 $ concat
624 $ tracePhyloAncestors newGroups) phylo
625 reBranched = fromList $ map (\g -> (getGroupId g, g)) $ concat
626 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g)) $ getGroupsFromScale scale phyloAncestor
627 in updatePhyloGroups scale reBranched phylo
628 where
629 -- | 1) for each periods
630 periods :: [Period]
631 periods = getPeriodIds phylo
632 -- --
633 scale :: Scale
634 scale = getLastLevel phylo
635 -- --
636 frame :: Int
637 frame = getTimeFrame $ timeUnit $ getConfig phylo
638 -- | 2) find ancestors between groups without parents
639 mapGroups :: [[PhyloGroup]]
640 mapGroups = map (\prd ->
641 let groups = getGroupsFromScalePeriods scale [prd] phylo
642 childs = getPreviousChildIds scale frame prd periods phylo
643 -- maybe add a better filter for non isolated ancestors
644 heads = filter (\g -> (not . null) $ (g ^. phylo_groupPeriodChilds))
645 $ filter (\g -> null (g ^. phylo_groupPeriodParents) && (notElem (getGroupId g) childs)) groups
646 noHeads = groups \\ heads
647 nbDocs = sum $ elems $ filterDocs (getDocsByDate phylo) [prd]
648 diago = reduceDiagos $ filterDiago (getCoocByDate phylo) [prd]
649 sim = (similarity $ getConfig phylo)
650 step = case getSeaElevation phylo of
651 Constante _ s -> s
652 Adaptative _ -> 0
653 Evolving _ -> 0
654 -- in headsToAncestors nbDocs diago Similarity heads groups []
655 in map (\ego -> toAncestor nbDocs diago sim step noHeads ego)
656 $ headsToAncestors nbDocs diago sim step heads []
657 ) periods
658 -- | 3) process this task concurrently
659 newGroups :: [[PhyloGroup]]
660 newGroups = mapGroups `using` parList rdeepseq
661 --------------------------------------
662
663 getPreviousChildIds :: Scale -> Int -> Period -> [Period] -> Phylo -> [PhyloGroupId]
664 getPreviousChildIds lvl frame curr prds phylo =
665 concat $ map ((map fst) . _phylo_groupPeriodChilds)
666 $ getGroupsFromScalePeriods lvl (getNextPeriods ToParents frame curr prds) phylo
667
668 ---------------------
669 -- | phyloExport | --
670 ---------------------
671
672 toPhyloExport :: Phylo -> DotGraph DotId
673 toPhyloExport phylo = exportToDot phylo
674 $ processFilters (exportFilter $ getConfig phylo) (phyloQuality $ getConfig phylo)
675 $ processSort (exportSort $ getConfig phylo) (getSeaElevation phylo)
676 $ processLabels (exportLabel $ getConfig phylo) (getRoots phylo) (getLastRootsFreq phylo)
677 $ processMetrics phylo export
678 where
679 export :: PhyloExport
680 export = PhyloExport groups branches
681 --------------------------------------
682 branches :: [PhyloBranch]
683 branches = map (\g ->
684 let seaLvl = (g ^. phylo_groupMeta) ! "seaLevels"
685 breaks = (g ^. phylo_groupMeta) ! "breaks"
686 canonId = take (round $ (last' "export" breaks) + 2) (snd $ g ^. phylo_groupBranchId)
687 in PhyloBranch (g ^. phylo_groupBranchId)
688 canonId
689 seaLvl
690 0
691 (last' "export" (take (round $ (last' "export" breaks) + 1) seaLvl))
692 0
693 0
694 "" empty)
695 $ map (\gs -> head' "export" gs)
696 $ groupBy (\g g' -> g ^. phylo_groupBranchId == g' ^. phylo_groupBranchId)
697 $ sortOn (\g -> g ^. phylo_groupBranchId) groups
698 --------------------------------------
699 groups :: [PhyloGroup]
700 groups = traceExportGroups
701 -- necessaire ?
702 $ processDynamics
703 $ getGroupsFromScale (phyloScale $ getConfig phylo)
704 $ tracePhyloInfo phylo
705
706
707 traceExportBranches :: [PhyloBranch] -> [PhyloBranch]
708 traceExportBranches branches = trace ("\n"
709 <> "-- | Export " <> show(length branches) <> " branches") branches
710
711 tracePhyloAncestors :: [[PhyloGroup]] -> [[PhyloGroup]]
712 tracePhyloAncestors groups = trace ( "-- | Found " <> show(length $ concat $ map _phylo_groupAncestors $ concat groups) <> " ancestors") groups
713
714 tracePhyloInfo :: Phylo -> Phylo
715 tracePhyloInfo phylo = trace ("\n" <> "##########################" <> "\n\n" <> "-- | Phylo with λ = "
716 <> show(getLevel phylo) <> " applied to "
717 <> show(length $ Vector.toList $ getRoots phylo) <> " foundations"
718 ) phylo
719
720
721 traceExportGroups :: [PhyloGroup] -> [PhyloGroup]
722 traceExportGroups groups = trace ("\n" <> "-- | Export "
723 <> show(length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups) <> " branches, "
724 <> show(length groups) <> " groups and "
725 <> show(length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) groups) <> " terms"
726 ) groups
727