2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Viz.Phylo.Aggregates.Cooc
20 import Data.List (union,concat,nub,sort, sortOn)
21 import Data.Map (Map,elems,adjust,filterWithKey,fromListWith,fromList,restrictKeys)
23 import Data.Vector (Vector)
24 import Gargantext.Prelude
25 import Gargantext.Viz.Phylo
26 import Gargantext.Viz.Phylo.Tools
27 import qualified Data.Map as Map
28 import qualified Data.Set as Set
30 -- import Debug.Trace (trace)
32 -- | To transform the Fis into a full coocurency Matrix in a Phylo
33 fisToCooc :: Map (Date, Date) [PhyloFis] -> Phylo -> Map (Int, Int) Double
34 fisToCooc m p = map (/docs)
35 $ foldl (\mem x -> adjust (+1) x mem) cooc
37 $ map (\x -> listToDirectedCombiWith (\y -> getIdxInRoots y p) $ (Set.toList . getClique) x)
40 --------------------------------------
42 fisNgrams = foldl (\mem x -> union mem $ (Set.toList . getClique) x) [] $ (concat . elems) m
43 --------------------------------------
45 docs = fromIntegral $ foldl (\mem x -> mem + (getSupport x)) 0 $ (concat . elems) m
46 --------------------------------------
47 cooc :: Map (Int, Int) (Double)
48 cooc = Map.fromList $ map (\x -> (x,0)) (listToDirectedCombiWith (\y -> getIdxInRoots y p) fisNgrams)
49 --------------------------------------
53 -- | To transform a tuple of group's information into a coocurency Matrix
54 toCooc :: [([Int],Double)] -> Map (Int, Int) Double
55 toCooc l = map (/docs)
56 $ foldl (\mem x -> adjust (+1) x mem) cooc
58 $ map (\x -> listToFullCombi $ fst x) l
60 --------------------------------------
62 idx = nub $ concat $ map fst l
63 --------------------------------------
65 docs = sum $ map snd l
66 --------------------------------------
67 cooc :: Map (Int, Int) (Double)
68 cooc = Map.fromList $ map (\x -> (x,0)) $ listToFullCombi idx
69 --------------------------------------
72 -- | To reduce a coocurency Matrix to some keys
73 getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
74 getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
75 && (elem (snd k) idx)) cooc
78 -- | To get a coocurency Matrix related to a given list of Periods
79 getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
80 getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
82 --------------------------------------
83 -- | Here we need to go back to the level 1 (aka : the Fis level)
85 gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
86 --------------------------------------
92 -- | To transform a list of index into a cooc matrix
93 listToCooc :: [Int] -> Map (Int,Int) Double
94 listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
97 -- | To transform a list of ngrams into a list of indexes
98 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
99 ngramsToIdx ns v = sort $ map (\n -> getIdxInVector n v) ns
102 -- | To build the cooc matrix by years out of the corpus
103 docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
104 docsToCooc docs fdt = fromListWith sumCooc
105 $ map (\(d,l) -> (d, listToCooc l))
106 $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
109 -- | To sum all the docs produced during a list of years
110 sumDocsByYears :: Set Date -> Map Date Double -> Double
111 sumDocsByYears years m = sum $ elems $ restrictKeys m years
114 -- | To get the cooc matrix of a group
115 groupToCooc :: PhyloGroup -> Phylo -> Map (Int,Int) Double
116 groupToCooc g p = getMiniCooc (listToFullCombi $ getGroupNgrams g) (periodsToYears [getGroupPeriod g]) (getPhyloCooc p)
119 -- | To get the union of the cooc matrix of two groups
120 unionOfCooc :: PhyloGroup -> PhyloGroup -> Phylo -> Map (Int,Int) Double
121 unionOfCooc g g' p = sumCooc (groupToCooc g p) (groupToCooc g' p)
124 -- | To get the nth most occurent elems in a coocurency matrix
125 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
126 getNthMostOcc nth cooc = (nub . concat)
127 $ map (\((idx,idx'),_) -> [idx,idx'])
130 $ sortOn snd $ Map.toList cooc
133 -- phyloCooc :: Map (Int, Int) Double
134 -- phyloCooc = fisToCooc phyloFis phylo1_0_1