]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
add the phyloBase, Fis and Cooc
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.PhyloMaker where
17
18 import Data.List (concat, nub, partition, sort, (++))
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, mapWithKey, toList, elems)
20 import Data.Set (size)
21 import Data.Vector (Vector)
22
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Text.Context (TermList)
27 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
28
29 import Control.DeepSeq (NFData)
30 import Control.Parallel.Strategies (parList, rdeepseq, using)
31 import Debug.Trace (trace)
32 import Control.Lens
33
34 import qualified Data.Vector as Vector
35 import qualified Data.Set as Set
36
37
38 ------------------
39 -- | To Phylo | --
40 ------------------
41
42
43 toPhylo :: [Document] -> TermList -> Config -> Phylo
44 toPhylo docs lst conf = phyloBase
45 where
46 --------------------------------------
47 _phylo1 :: Phylo
48 _phylo1 = toPhylo1 docs phyloBase
49 --------------------------------------
50 phyloBase :: Phylo
51 phyloBase = toPhyloBase docs lst conf
52 --------------------------------------
53
54
55
56 --------------------
57 -- | To Phylo 1 | --
58 --------------------
59
60
61 toPhylo1 :: [Document] -> Phylo -> Phylo
62 toPhylo1 docs phyloBase = undefined
63 where
64 --------------------------------------
65 _mFis :: Map (Date,Date) [PhyloFis]
66 _mFis = toPhyloFis _docs' (fisSupport $ getConfig phyloBase) (fisSize $ getConfig phyloBase)
67 --------------------------------------
68 _docs' :: Map (Date,Date) [Document]
69 _docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
70 --------------------------------------
71
72
73 ---------------------------
74 -- | Frequent Item Set | --
75 ---------------------------
76
77
78 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
79 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
80 filterFis keep thr f m = case keep of
81 False -> map (\l -> f thr l) m
82 True -> map (\l -> keepFilled (f) thr l) m
83
84
85 -- | To filter Fis with small Support
86 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
87 filterFisBySupport thr l = filter (\fis -> (fis ^. phyloFis_support) >= thr) l
88
89
90 -- | To filter Fis with small Clique size
91 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
92 filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloFis_clique) >= thr) l
93
94
95 -- | To filter nested Fis
96 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
97 filterFisByNested m =
98 let fis = map (\l ->
99 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloFis_clique) (Set.toList $ f ^. phyloFis_clique)) mem)
100 then mem
101 else
102 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloFis_clique) (Set.toList $ f' ^. phyloFis_clique)) mem
103 in fMax ++ [f] ) [] l)
104 $ elems m
105 fis' = fis `using` parList rdeepseq
106 in fromList $ zip (keys m) fis'
107
108
109 -- | To transform a time map of docs innto a time map of Fis with some filters
110 toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloFis]
111 toPhyloFis mDocs support clique = traceFis "Filtered Fis"
112 $ filterFisByNested
113 $ traceFis "Filtered by clique size"
114 $ filterFis True clique (filterFisByClique)
115 $ traceFis "Filtered by support"
116 $ filterFis True support (filterFisBySupport)
117 $ traceFis "Unfiltered Fis" mFis
118 where
119 --------------------------------------
120 -- | create the fis from the docs for each period
121 mFis :: Map (Date,Date) [PhyloFis]
122 mFis = mapWithKey (\prd docs -> let fis = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
123 in map (\f -> PhyloFis (fst f) (snd f) prd) fis ) mDocs
124 --------------------------------------
125
126
127 --------------------
128 -- | Coocurency | --
129 --------------------
130
131
132 -- | To transform the docs into a time map of coocurency matrix
133 docsToCoocByYear :: [Document] -> Vector Ngrams -> Config -> Map Date Cooc
134 docsToCoocByYear docs fdt conf =
135 let mCooc = fromListWith sumCooc
136 $ map (\(_d,l) -> (_d, listToMatrix l))
137 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
138 mCooc' = fromList
139 $ map (\t -> (t,empty))
140 $ toTimeScale (map date docs) (timeUnit conf)
141 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
142 $ unionWith sumCooc mCooc mCooc'
143
144
145 -----------------------
146 -- | to Phylo Base | --
147 -----------------------
148
149
150 -- | To group a list of Documents by fixed periods
151 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
152 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
153 groupDocsByPeriod f pds es =
154 let periods = map (inPeriode f es) pds
155 periods' = periods `using` parList rdeepseq
156
157 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
158 $ fromList $ zip pds periods'
159 where
160 --------------------------------------
161 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
162 inPeriode f' h (start,end) =
163 fst $ partition (\d -> f' d >= start && f' d <= end) h
164 --------------------------------------
165
166
167 -- | To count the number of docs by unit of time (like a year)
168 nbDocsByTime :: [Document] -> Int -> Map Date Double
169 nbDocsByTime docs step =
170 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
171 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') step
172 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
173 $ unionWith (+) time docs'
174
175
176 -- | To init the basic elements of a Phylo
177 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
178 toPhyloBase docs lst conf =
179 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
180 params = defaultPhyloParam { _phyloParam_config = conf }
181 periods = toPeriods (sort $ nub $ map date docs) (timePeriod conf) (timeStep conf)
182 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
183 $ Phylo foundations
184 (docsToCoocByYear docs (foundations ^. foundations_roots) conf)
185 (nbDocsByTime docs $ timeUnit conf)
186 params
187 (map (\prd -> PhyloPeriod prd []) periods)