module Gargantext.Core.Viz.Phylo.PhyloExport where
-import Data.Map (Map, fromList, empty, fromListWith, insert, (!), elems, unionWith, findWithDefault, toList)
-import Data.List ((++), sort, nub, concat, sortOn, reverse, groupBy, union, (\\), (!!), init, partition, unwords, nubBy, inits, elemIndex)
+import Data.Map (Map, fromList, empty, fromListWith, insert, (!), elems, unionWith, findWithDefault, toList, member)
+import Data.List ((++), sort, nub, null, concat, sortOn, groupBy, union, (\\), (!!), init, partition, notElem, unwords, nubBy, inits, elemIndex)
import Data.Vector (Vector)
import Prelude (writeFile)
import Gargantext.Prelude
import Gargantext.Core.Viz.AdaptativePhylo
import Gargantext.Core.Viz.Phylo.PhyloTools
+import Gargantext.Core.Viz.Phylo.TemporalMatching (filterDocs, filterDiago, reduceDiagos, toProximity, getNextPeriods)
-import Control.Lens
+import Control.Lens hiding (Level)
+import Control.Parallel.Strategies (parList, rdeepseq, using)
import Data.GraphViz hiding (DotGraph, Order)
import Data.GraphViz.Types.Generalised (DotGraph)
import Data.GraphViz.Attributes.Complete hiding (EdgeType, Order)
import System.FilePath
import Debug.Trace (trace)
-import qualified Data.Map as Map
import qualified Data.Text as Text
import qualified Data.Vector as Vector
import qualified Data.Text.Lazy as Lazy
, toAttr "label" (pack $ show $ b ^. branch_label)
])
-periodToDotNode :: (Date,Date) -> Dot DotId
-periodToDotNode prd =
+periodToDotNode :: (Date,Date) -> (Text.Text,Text.Text) -> Dot DotId
+periodToDotNode prd prd' =
node (periodIdToDotId prd)
([Shape BoxShape, FontSize 50, Label (toDotLabel $ Text.pack (show (fst prd) <> " " <> show (snd prd)))]
- <> [ toAttr "nodeType" "period"
+ <> [ toAttr "nodeType" "period"
+ , toAttr "strFrom" (fromStrict $ Text.pack $ (show $ fst prd'))
+ , toAttr "strTo" (fromStrict $ Text.pack $ (show $ snd prd'))
, toAttr "from" (fromStrict $ Text.pack $ (show $ fst prd))
, toAttr "to" (fromStrict $ Text.pack $ (show $ snd prd))])
<> [ toAttr "nodeType" "group"
, toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
, toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
+ , toAttr "strFrom" (pack $ show (fst $ g ^. phylo_groupPeriod'))
+ , toAttr "strTo" (pack $ show (snd $ g ^. phylo_groupPeriod'))
, toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
, toAttr "bId" (pack $ show bId)
- , toAttr "support" (pack $ show (g ^. phylo_groupSupport))])
+ , toAttr "support" (pack $ show (g ^. phylo_groupSupport))
+ , toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
+ , toAttr "source" (pack $ show (nub $ g ^. phylo_groupSources))
+ , toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
+ , toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
+ , toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
+ , toAttr "frequence" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "frequence")))
+ ])
-toDotEdge :: DotId -> DotId -> Text.Text -> EdgeType -> Dot DotId
+toDotEdge :: DotId -> DotId -> [Char] -> EdgeType -> Dot DotId
toDotEdge source target lbl edgeType = edge source target
(case edgeType of
- GroupToGroup -> [ Width 3, penWidth 4, Color [toWColor Black], Constraint True
- , Label (StrLabel $ fromStrict lbl)] <> [toAttr "edgeType" "link" ]
- BranchToGroup -> [ Width 3, Color [toWColor Black], ArrowHead (AType [(ArrMod FilledArrow RightSide,DotArrow)])
- , Label (StrLabel $ fromStrict lbl)] <> [toAttr "edgeType" "branchLink" ]
- BranchToBranch -> [ Width 2, Color [toWColor Black], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,DotArrow)])
- , Label (StrLabel $ fromStrict lbl)]
- PeriodToPeriod -> [ Width 5, Color [toWColor Black]])
+ GroupToGroup -> [ Width 3, penWidth 4, Color [toWColor Black], Constraint True] <> [toAttr "edgeType" "link", toAttr "lbl" (pack lbl)]
+ BranchToGroup -> [ Width 3, Color [toWColor Black], ArrowHead (AType [(ArrMod FilledArrow RightSide,DotArrow)])] <> [toAttr "edgeType" "branchLink" ]
+ BranchToBranch -> [ Width 2, Color [toWColor Black], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,DotArrow)])]
+ GroupToAncestor -> [ Width 3, Color [toWColor Red], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,NoArrow)]), PenWidth 4] <> [toAttr "edgeType" "ancestorLink", toAttr "lbl" (pack lbl)]
+ PeriodToPeriod -> [ Width 5, Color [toWColor Black]])
mergePointers :: [PhyloGroup] -> Map (PhyloGroupId,PhyloGroupId) Double
toParents = fromList $ concat $ map (\g -> map (\(target,w) -> ((target,getGroupId g),w)) $ g ^. phylo_groupPeriodParents) groups
in unionWith (\w w' -> max w w') toChilds toParents
+mergeAncestors :: [PhyloGroup] -> [((PhyloGroupId,PhyloGroupId), Double)]
+mergeAncestors groups = concat
+ $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupAncestors)
+ $ filter (\g -> (not . null) $ g ^. phylo_groupAncestors) groups
+
toBid :: PhyloGroup -> [PhyloBranch] -> Int
toBid g bs =
,(toAttr (fromStrict "phyloPeriods") $ pack $ show (length $ elems $ phylo ^. phylo_periods))
,(toAttr (fromStrict "phyloBranches") $ pack $ show (length $ export ^. export_branches))
,(toAttr (fromStrict "phyloGroups") $ pack $ show (length $ export ^. export_groups))
+ ,(toAttr (fromStrict "phyloSources") $ pack $ show (Vector.toList $ getSources phylo))
+ ,(toAttr (fromStrict "phyloTimeScale") $ pack $ getTimeScale phylo)
+ -- ,(toAttr (fromStrict "phyloTermsFreq") $ pack $ show (toList $ _phylo_lastTermFreq phylo))
])
{-
-- 2) create a layer for the branches labels -}
subgraph (Str "Branches peaks") $ do
- graphAttrs [Rank SameRank]
+ -- graphAttrs [Rank SameRank]
{-
-- 3) group the branches by hierarchy
-- mapM (\branches ->
{-- 5) create a layer for each period -}
_ <- mapM (\period ->
- subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst period) <> show (snd period))) $ do
+ subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst $ _phylo_periodPeriod period) <> show (snd $ _phylo_periodPeriod period))) $ do
graphAttrs [Rank SameRank]
- periodToDotNode period
+ periodToDotNode (period ^. phylo_periodPeriod) (period ^. phylo_periodPeriod')
{-- 6) create a node for each group -}
- mapM (\g -> groupToDotNode (getRoots phylo) g (toBid g (export ^. export_branches))) (filter (\g -> g ^. phylo_groupPeriod == period) $ export ^. export_groups)
- ) $ getPeriodIds phylo
+ mapM (\g -> groupToDotNode (getRoots phylo) g (toBid g (export ^. export_branches))) (filter (\g -> g ^. phylo_groupPeriod == (period ^. phylo_periodPeriod)) $ export ^. export_groups)
+ ) $ phylo ^. phylo_periods
{-- 7) create the edges between a branch and its first groups -}
_ <- mapM (\(bId,groups) ->
$ fromListWith (++) $ map (\g -> (g ^. phylo_groupBranchId,[g])) $ export ^. export_groups
{- 8) create the edges between the groups -}
- _ <- mapM (\((k,k'),_) ->
- toDotEdge (groupIdToDotId k) (groupIdToDotId k') "" GroupToGroup
+ _ <- mapM (\((k,k'),v) ->
+ toDotEdge (groupIdToDotId k) (groupIdToDotId k') (show v) GroupToGroup
) $ (toList . mergePointers) $ export ^. export_groups
- {- 7) create the edges between the periods -}
+ _ <- mapM (\((k,k'),v) ->
+ toDotEdge (groupIdToDotId k) (groupIdToDotId k') (show v) GroupToAncestor
+ ) $ mergeAncestors $ export ^. export_groups
+
+ -- 10) create the edges between the periods
_ <- mapM (\(prd,prd') ->
toDotEdge (periodIdToDotId prd) (periodIdToDotId prd') "" PeriodToPeriod
) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ getPeriodIds phylo
filterByBranchSize :: Double -> PhyloExport -> PhyloExport
filterByBranchSize thr export =
- let branches' = partition (\b -> head' "filter" ((b ^. branch_meta) ! "size") >= thr) $ export ^. export_branches
- in export & export_branches .~ (fst branches')
- & export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd branches')))
+ let splited = partition (\b -> head' "filter" ((b ^. branch_meta) ! "size") >= thr) $ export ^. export_branches
+ in export & export_branches .~ (fst splited)
+ & export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd splited)))
processFilters :: [Filter] -> Quality -> PhyloExport -> PhyloExport
$ zip branches
$ ([0] ++ (map (\(b,b') ->
let idx = length $ commonPrefix (b ^. branch_canonId) (b' ^. branch_canonId) []
- in (b' ^. branch_seaLevel) !! (idx - 1)
+ lmin = min (length $ b ^. branch_seaLevel) (length $ b' ^. branch_seaLevel)
+ in
+ if ((idx - 1) > ((length $ b' ^. branch_seaLevel) - 1))
+ then (b' ^. branch_seaLevel) !! (lmin - 1)
+ else (b' ^. branch_seaLevel) !! (idx - 1)
) $ listToSeq branches))
in map (\(x,b) -> b & branch_x .~ x)
$ zip steps branches
+branchToIso' :: Double -> Double -> [PhyloBranch] -> [PhyloBranch]
+branchToIso' start step branches =
+ let bx = map (\l -> (sum l) + ((fromIntegral $ length l) * 0.5))
+ $ inits
+ $ ([0] ++ (map (\(b,b') ->
+ let root = fromIntegral $ length $ commonPrefix (snd $ b ^. branch_id) (snd $ b' ^. branch_id) []
+ in 1 - start - step * root) $ listToSeq branches))
+ in map (\(x,b) -> b & branch_x .~ x)
+ $ zip bx branches
+
sortByHierarchy :: Int -> [PhyloBranch] -> [PhyloBranch]
sortByHierarchy depth branches =
if (length branches == 1)
- then branchToIso branches
- else branchToIso $ concat
+ then branches
+ else concat
$ map (\branches' ->
let partitions = partition (\b -> depth + 1 == ((length . snd) $ b ^. branch_id)) branches'
in (sortOn (\b -> (b ^. branch_meta) ! "birth") (fst partitions))
Desc -> reverse branches
in export & export_branches .~ branches'
-processSort :: Sort -> PhyloExport -> PhyloExport
-processSort sort' export = case sort' of
+processSort :: Sort -> SeaElevation -> PhyloExport -> PhyloExport
+processSort sort' elev export = case sort' of
ByBirthDate o -> sortByBirthDate o export
- ByHierarchy -> export & export_branches .~ sortByHierarchy 0 (export ^. export_branches)
+ ByHierarchy -> export & export_branches .~ (branchToIso' (_cons_start elev) (_cons_step elev)
+ $ sortByHierarchy 0 (export ^. export_branches))
-----------------
+ (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
-ngramsMetrics :: PhyloExport -> PhyloExport
-ngramsMetrics export =
+ngramsMetrics :: Phylo -> PhyloExport -> PhyloExport
+ngramsMetrics phylo export =
over ( export_groups
. traverse )
(\g -> g & phylo_groupMeta %~ insert "genericity"
(map (\n -> specificity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
& phylo_groupMeta %~ insert "inclusion"
(map (\n -> inclusion (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
+ & phylo_groupMeta %~ insert "frequence"
+ (map (\n -> getInMap n (phylo ^. phylo_lastTermFreq)) $ g ^. phylo_groupNgrams)
) export
& branch_meta %~ insert "age" [fromIntegral age]
& branch_meta %~ insert "size" [fromIntegral $ length periods] ) export
-processMetrics :: PhyloExport -> PhyloExport
-processMetrics export = ngramsMetrics
- $ branchDating export
+processMetrics :: Phylo -> PhyloExport -> PhyloExport
+processMetrics phylo export = ngramsMetrics phylo
+ $ branchDating export
-----------------
-- | Taggers | --
-----------------
+nk :: Int -> [[Int]] -> Int
+nk n groups = sum
+ $ map (\g -> if (elem n g)
+ then 1
+ else 0) groups
+
+
+tf :: Int -> [[Int]] -> Double
+tf n groups = (fromIntegral $ nk n groups) / (fromIntegral $ length $ concat groups)
+
+
+idf :: Int -> [[Int]] -> Double
+idf n groups = log ((fromIntegral $ length groups) / (fromIntegral $ nk n groups))
+
+
+findTfIdf :: [[Int]] -> [(Int,Double)]
+findTfIdf groups = reverse $ sortOn snd $ map (\n -> (n,(tf n groups) * (idf n groups))) $ sort $ nub $ concat groups
+
+
+findEmergences :: [PhyloGroup] -> Map Int Double -> [(Int,Double)]
+findEmergences groups freq =
+ let ngrams = map _phylo_groupNgrams groups
+ dynamics = map (\g -> (g ^. phylo_groupMeta) ! "dynamics") groups
+ emerging = nubBy (\n1 n2 -> fst n1 == fst n2)
+ $ concat $ map (\g -> filter (\(_,d) -> d == 0) $ zip (fst g) (snd g)) $ zip ngrams dynamics
+ in reverse $ sortOn snd
+ $ map (\(n,_) -> if (member n freq)
+ then (n,freq ! n)
+ else (n,0)) emerging
+
+
+mostEmergentTfIdf :: Int -> Map Int Double -> Vector Ngrams -> PhyloExport -> PhyloExport
+mostEmergentTfIdf nth freq foundations export =
+ over ( export_branches
+ . traverse )
+ (\b ->
+ let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
+ tfidf = findTfIdf (map _phylo_groupNgrams groups)
+ emergences = findEmergences groups freq
+ selected = if (null emergences)
+ then map fst $ take nth tfidf
+ else [fst $ head' "mostEmergentTfIdf" emergences]
+ ++ (map fst $ take (nth - 1) $ filter (\(n,_) -> n /= (fst $ head' "mostEmergentTfIdf" emergences)) tfidf)
+ in b & branch_label .~ (ngramsToLabel foundations selected)) export
+
+
getNthMostMeta :: Int -> [Double] -> [Int] -> [Int]
getNthMostMeta nth meta ns = map (\(idx,_) -> (ns !! idx))
$ take nth
in g & phylo_groupLabel .~ lbl ) export
-processLabels :: [PhyloLabel] -> Vector Ngrams -> PhyloExport -> PhyloExport
-processLabels labels foundations export =
+processLabels :: [PhyloLabel] -> Vector Ngrams -> Map Int Double -> PhyloExport -> PhyloExport
+processLabels labels foundations freq export =
foldl (\export' label ->
case label of
GroupLabel tagger nth ->
BranchLabel tagger nth ->
case tagger of
MostInclusive -> mostInclusive nth foundations export'
+ MostEmergentTfIdf -> mostEmergentTfIdf nth freq foundations export'
_ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger" ) export labels
{- decrease -}
then 2
else if ((fst prd) == (fst $ m ! n))
- {- recombination -}
+ {- emerging -}
then 0
else if isNew
{- emergence -}
-- | horizon | --
-----------------
-horizonToAncestors :: Double -> Phylo -> [PhyloAncestor]
-horizonToAncestors delta phylo =
- let horizon = Map.toList $ Map.filter (\v -> v > delta) $ phylo ^. phylo_horizon
- ct0 = fromList $ map (\g -> (getGroupId g, g)) $ getGroupsFromLevelPeriods 1 (take 1 (getPeriodIds phylo)) phylo
- aDelta = toRelatedComponents
- (elems ct0)
- (map (\((g,g'),v) -> ((ct0 ! g,ct0 ! g'),v)) horizon)
- in map (\(id,groups) -> toAncestor id groups) $ zip [1..] aDelta
- where
- -- | note : possible bug if we sync clus more than once
- -- | horizon is calculated at level 1, ancestors have to be related to the last level
- toAncestor :: Int -> [PhyloGroup] -> PhyloAncestor
- toAncestor id groups = PhyloAncestor id
- (foldl' (\acc g -> union acc (g ^. phylo_groupNgrams)) [] groups)
- (concat $ map (\g -> map fst (g ^. phylo_groupLevelParents)) groups)
-
+getGroupThr :: Double -> PhyloGroup -> Double
+getGroupThr step g =
+ let seaLvl = (g ^. phylo_groupMeta) ! "seaLevels"
+ breaks = (g ^. phylo_groupMeta) ! "breaks"
+ in (last' "export" (take (round $ (last' "export" breaks) + 1) seaLvl)) - step
+
+toAncestor :: Double -> Map Int Double -> Proximity -> Double -> [PhyloGroup] -> PhyloGroup -> PhyloGroup
+toAncestor nbDocs diago proximity step candidates ego =
+ let curr = ego ^. phylo_groupAncestors
+ in ego & phylo_groupAncestors .~ (curr ++ (map (\(g,w) -> (getGroupId g,w))
+ $ filter (\(g,w) -> (w > 0) && (w >= (min (getGroupThr step ego) (getGroupThr step g))))
+ $ map (\g -> (g, toProximity nbDocs diago proximity (ego ^. phylo_groupNgrams) (g ^. phylo_groupNgrams) (g ^. phylo_groupNgrams)))
+ $ filter (\g -> g ^. phylo_groupBranchId /= ego ^. phylo_groupBranchId ) candidates))
+
+
+headsToAncestors :: Double -> Map Int Double -> Proximity -> Double -> [PhyloGroup] -> [PhyloGroup] -> [PhyloGroup]
+headsToAncestors nbDocs diago proximity step heads acc =
+ if (null heads)
+ then acc
+ else
+ let ego = head' "headsToAncestors" heads
+ heads' = tail' "headsToAncestors" heads
+ in headsToAncestors nbDocs diago proximity step heads' (acc ++ [toAncestor nbDocs diago proximity step heads' ego])
+
+
+toHorizon :: Phylo -> Phylo
+toHorizon phylo =
+ let phyloAncestor = updatePhyloGroups
+ level
+ (fromList $ map (\g -> (getGroupId g, g))
+ $ concat
+ $ tracePhyloAncestors newGroups) phylo
+ reBranched = fromList $ map (\g -> (getGroupId g, g)) $ concat
+ $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g)) $ getGroupsFromLevel level phyloAncestor
+ in updatePhyloGroups level reBranched phylo
+ where
+ -- | 1) for each periods
+ periods :: [PhyloPeriodId]
+ periods = getPeriodIds phylo
+ -- --
+ level :: Level
+ level = getLastLevel phylo
+ -- --
+ frame :: Int
+ frame = getTimeFrame $ timeUnit $ getConfig phylo
+ -- | 2) find ancestors between groups without parents
+ mapGroups :: [[PhyloGroup]]
+ mapGroups = map (\prd ->
+ let groups = getGroupsFromLevelPeriods level [prd] phylo
+ childs = getPreviousChildIds level frame prd periods phylo
+ -- maybe add a better filter for non isolated ancestors
+ heads = filter (\g -> (not . null) $ (g ^. phylo_groupPeriodChilds))
+ $ filter (\g -> null (g ^. phylo_groupPeriodParents) && (notElem (getGroupId g) childs)) groups
+ noHeads = groups \\ heads
+ nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) [prd]
+ diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) [prd]
+ proximity = (phyloProximity $ getConfig phylo)
+ step = case getSeaElevation phylo of
+ Constante _ s -> s
+ Adaptative _ -> undefined
+ -- in headsToAncestors nbDocs diago proximity heads groups []
+ in map (\ego -> toAncestor nbDocs diago proximity step noHeads ego)
+ $ headsToAncestors nbDocs diago proximity step heads []
+ ) periods
+ -- | 3) process this task concurrently
+ newGroups :: [[PhyloGroup]]
+ newGroups = mapGroups `using` parList rdeepseq
+ --------------------------------------
+
+getPreviousChildIds :: Level -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> Phylo -> [PhyloGroupId]
+getPreviousChildIds lvl frame curr prds phylo =
+ concat $ map ((map fst) . _phylo_groupPeriodChilds)
+ $ getGroupsFromLevelPeriods lvl (getNextPeriods ToParents frame curr prds) phylo
---------------------
-- | phyloExport | --
toPhyloExport :: Phylo -> DotGraph DotId
toPhyloExport phylo = exportToDot phylo
$ processFilters (exportFilter $ getConfig phylo) (phyloQuality $ getConfig phylo)
- $ processSort (exportSort $ getConfig phylo)
- $ processLabels (exportLabel $ getConfig phylo) (getRoots phylo)
- $ processMetrics export
+ $ processSort (exportSort $ getConfig phylo) (getSeaElevation phylo)
+ $ processLabels (exportLabel $ getConfig phylo) (getRoots phylo) (_phylo_lastTermFreq phylo)
+ $ processMetrics phylo export
where
export :: PhyloExport
- export = PhyloExport groups branches (horizonToAncestors 0 phylo)
+ export = PhyloExport groups branches
--------------------------------------
branches :: [PhyloBranch]
branches = map (\g ->
$ processDynamics
$ getGroupsFromLevel (phyloLevel $ getConfig phylo)
$ tracePhyloInfo phylo
+ -- \$ toHorizon phylo
traceExportBranches :: [PhyloBranch] -> [PhyloBranch]
traceExportBranches branches = trace ("\n"
<> "-- | Export " <> show(length branches) <> " branches") branches
+tracePhyloAncestors :: [[PhyloGroup]] -> [[PhyloGroup]]
+tracePhyloAncestors groups = trace ( "-- | Found " <> show(length $ concat $ map _phylo_groupAncestors $ concat groups) <> " ancestors") groups
+
tracePhyloInfo :: Phylo -> Phylo
-tracePhyloInfo phylo = trace ("\n" <> "##########################" <> "\n\n" <> "-- | Phylo with �� = "
+tracePhyloInfo phylo = trace ("\n" <> "##########################" <> "\n\n" <> "-- | Phylo with �� = "
<> show(_qua_granularity $ phyloQuality $ getConfig phylo) <> " applied to "
<> show(length $ Vector.toList $ getRoots phylo) <> " foundations"
) phylo