]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/Aggregates/Cooc.hs
Merge branch 'dev-phylo' into dev-merge
[gargantext.git] / src / Gargantext / Viz / Phylo / Aggregates / Cooc.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Viz.Phylo.Aggregates.Cooc
18 where
19
20 import Data.List (union,concat,nub,sort, sortOn)
21 import Data.Map (Map,elems,adjust,filterWithKey,fromListWith,fromList,restrictKeys)
22 import Data.Set (Set)
23 import Data.Vector (Vector)
24 import Gargantext.Prelude
25 import Gargantext.Viz.Phylo
26 import Gargantext.Viz.Phylo.Tools
27 import qualified Data.Map as Map
28 import qualified Data.Set as Set
29
30 -- import Debug.Trace (trace)
31
32 -- | To transform the Fis into a full coocurency Matrix in a Phylo
33 fisToCooc :: Map (Date, Date) [PhyloFis] -> Phylo -> Map (Int, Int) Double
34 fisToCooc m p = map (/docs)
35 $ foldl (\mem x -> adjust (+1) x mem) cooc
36 $ concat
37 $ map (\x -> listToDirectedCombiWith (\y -> getIdxInRoots y p) $ (Set.toList . getClique) x)
38 $ (concat . elems) m
39 where
40 --------------------------------------
41 fisNgrams :: [Ngrams]
42 fisNgrams = foldl (\mem x -> union mem $ (Set.toList . getClique) x) [] $ (concat . elems) m
43 --------------------------------------
44 docs :: Double
45 docs = fromIntegral $ foldl (\mem x -> mem + (getSupport x)) 0 $ (concat . elems) m
46 --------------------------------------
47 cooc :: Map (Int, Int) (Double)
48 cooc = Map.fromList $ map (\x -> (x,0)) (listToDirectedCombiWith (\y -> getIdxInRoots y p) fisNgrams)
49 --------------------------------------
50
51
52
53 -- | To transform a tuple of group's information into a coocurency Matrix
54 toCooc :: [([Int],Double)] -> Map (Int, Int) Double
55 toCooc l = map (/docs)
56 $ foldl (\mem x -> adjust (+1) x mem) cooc
57 $ concat
58 $ map (\x -> listToFullCombi $ fst x) l
59 where
60 --------------------------------------
61 idx :: [Int]
62 idx = nub $ concat $ map fst l
63 --------------------------------------
64 docs :: Double
65 docs = sum $ map snd l
66 --------------------------------------
67 cooc :: Map (Int, Int) (Double)
68 cooc = Map.fromList $ map (\x -> (x,0)) $ listToFullCombi idx
69 --------------------------------------
70
71
72 -- | To reduce a coocurency Matrix to some keys
73 getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
74 getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
75 && (elem (snd k) idx)) cooc
76
77
78 -- | To get a coocurency Matrix related to a given list of Periods
79 getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
80 getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
81 where
82 --------------------------------------
83 -- | Here we need to go back to the level 1 (aka : the Fis level)
84 gs :: [PhyloGroup]
85 gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
86 --------------------------------------
87
88
89
90
91
92 -- | To transform a list of index into a cooc matrix
93 listToCooc :: [Int] -> Map (Int,Int) Double
94 listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
95
96
97 -- | To transform a list of ngrams into a list of indexes
98 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
99 ngramsToIdx ns v = sort $ map (\n -> getIdxInVector n v) ns
100
101
102 -- | To build the cooc matrix by years out of the corpus
103 docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
104 docsToCooc docs fdt = fromListWith sumCooc
105 $ map (\(d,l) -> (d, listToCooc l))
106 $ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
107
108
109 -- | To sum all the docs produced during a list of years
110 sumDocsByYears :: Set Date -> Map Date Double -> Double
111 sumDocsByYears years m = sum $ elems $ restrictKeys m years
112
113
114 -- | To get the cooc matrix of a group
115 groupToCooc :: PhyloGroup -> Phylo -> Map (Int,Int) Double
116 groupToCooc g p = getMiniCooc (listToFullCombi $ getGroupNgrams g) (periodsToYears [getGroupPeriod g]) (getPhyloCooc p)
117
118
119 -- | To get the union of the cooc matrix of two groups
120 unionOfCooc :: PhyloGroup -> PhyloGroup -> Phylo -> Map (Int,Int) Double
121 unionOfCooc g g' p = sumCooc (groupToCooc g p) (groupToCooc g' p)
122
123
124 -- | To get the nth most occurent elems in a coocurency matrix
125 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
126 getNthMostOcc nth cooc = (nub . concat)
127 $ map (\((idx,idx'),_) -> [idx,idx'])
128 $ take nth
129 $ reverse
130 $ sortOn snd $ Map.toList cooc
131
132
133 -- phyloCooc :: Map (Int, Int) Double
134 -- phyloCooc = fisToCooc phyloFis phylo1_0_1