]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/AdaptativePhylo.hs
working on perf
[gargantext.git] / src / Gargantext / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE TemplateHaskell #-}
27 {-# LANGUAGE MultiParamTypeClasses #-}
28
29 module Gargantext.Viz.AdaptativePhylo where
30
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Text (Text, pack)
34 import Data.Vector (Vector)
35 import Data.Map (Map)
36 import Data.Set (Set)
37
38 import Gargantext.Core.Utils.Prefix (unPrefix)
39 import Gargantext.Prelude
40 import Gargantext.Text.Context (TermList)
41
42 import GHC.Generics
43 import GHC.IO (FilePath)
44 import Control.DeepSeq (NFData)
45 import Control.Lens (makeLenses)
46
47 import qualified Data.Text.Lazy as TextLazy
48
49
50 ----------------
51 -- | Config | --
52 ----------------
53
54
55 data CorpusParser =
56 Wos {_wos_limit :: Int}
57 | Csv {_csv_limit :: Int}
58 deriving (Show,Generic,Eq)
59
60
61 data Proximity =
62 WeightedLogJaccard
63 { _wlj_sensibility :: Double
64 , _wlj_thresholdInit :: Double
65 , _wlj_thresholdStep :: Double }
66 | Hamming
67 deriving (Show,Generic,Eq)
68
69
70 data Synchrony =
71 ByProximityThreshold
72 { _bpt_threshold :: Double
73 , _bpt_sensibility :: Double}
74 | ByProximityDistribution
75 deriving (Show,Generic,Eq)
76
77
78 data TimeUnit =
79 Year
80 { _year_period :: Int
81 , _year_step :: Int
82 , _year_matchingFrame :: Int }
83 deriving (Show,Generic,Eq)
84
85
86 data ContextualUnit =
87 Fis
88 { _fis_support :: Int
89 , _fis_size :: Int }
90 deriving (Show,Generic,Eq)
91
92
93 data Config =
94 Config { corpusPath :: FilePath
95 , listPath :: FilePath
96 , outputPath :: FilePath
97 , corpusParser :: CorpusParser
98 , phyloName :: Text
99 , phyloLevel :: Int
100 , phyloProximity :: Proximity
101 , phyloSynchrony :: Synchrony
102 , timeUnit :: TimeUnit
103 , contextualUnit :: ContextualUnit
104 , exportLabel :: [PhyloLabel]
105 , exportSort :: Sort
106 , exportFilter :: [Filter]
107 } deriving (Show,Generic,Eq)
108
109
110 defaultConfig :: Config
111 defaultConfig =
112 Config { corpusPath = ""
113 , listPath = ""
114 , outputPath = ""
115 , corpusParser = Csv 1000
116 , phyloName = pack "Default Phylo"
117 , phyloLevel = 1
118 , phyloProximity = WeightedLogJaccard 10 0 0.1
119 , phyloSynchrony = ByProximityThreshold 0.4 0
120 , timeUnit = Year 3 1 5
121 , contextualUnit = Fis 2 4
122 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
123 , exportSort = ByHierarchy
124 , exportFilter = [ByBranchSize 2]
125 }
126
127 instance FromJSON Config
128 instance ToJSON Config
129 instance FromJSON CorpusParser
130 instance ToJSON CorpusParser
131 instance FromJSON Proximity
132 instance ToJSON Proximity
133 instance FromJSON TimeUnit
134 instance ToJSON TimeUnit
135 instance FromJSON ContextualUnit
136 instance ToJSON ContextualUnit
137 instance FromJSON PhyloLabel
138 instance ToJSON PhyloLabel
139 instance FromJSON Tagger
140 instance ToJSON Tagger
141 instance FromJSON Sort
142 instance ToJSON Sort
143 instance FromJSON Order
144 instance ToJSON Order
145 instance FromJSON Filter
146 instance ToJSON Filter
147 instance FromJSON Synchrony
148 instance ToJSON Synchrony
149
150
151 -- | Software parameters
152 data Software =
153 Software { _software_name :: Text
154 , _software_version :: Text
155 } deriving (Generic, Show, Eq)
156
157 defaultSoftware :: Software
158 defaultSoftware =
159 Software { _software_name = pack "Gargantext"
160 , _software_version = pack "v4" }
161
162
163 -- | Global parameters of a Phylo
164 data PhyloParam =
165 PhyloParam { _phyloParam_version :: Text
166 , _phyloParam_software :: Software
167 , _phyloParam_config :: Config
168 } deriving (Generic, Show, Eq)
169
170 defaultPhyloParam :: PhyloParam
171 defaultPhyloParam =
172 PhyloParam { _phyloParam_version = pack "v2.adaptative"
173 , _phyloParam_software = defaultSoftware
174 , _phyloParam_config = defaultConfig }
175
176
177 ------------------
178 -- | Document | --
179 ------------------
180
181
182 -- | Date : a simple Integer
183 type Date = Int
184
185 -- | Ngrams : a contiguous sequence of n terms
186 type Ngrams = Text
187
188 -- | Document : a piece of Text linked to a Date
189 data Document = Document
190 { date :: Date
191 , text :: [Ngrams]
192 } deriving (Eq,Show,Generic,NFData)
193
194
195 --------------------
196 -- | Foundation | --
197 --------------------
198
199
200 -- | The Foundations of a Phylo created from a given TermList
201 data PhyloFoundations = PhyloFoundations
202 { _foundations_roots :: !(Vector Ngrams)
203 , _foundations_mapList :: TermList
204 } deriving (Generic, Show, Eq)
205
206
207 ---------------------------
208 -- | Coocurency Matrix | --
209 ---------------------------
210
211
212 -- | Cooc : a coocurency matrix between two ngrams
213 type Cooc = Map (Int,Int) Double
214
215
216 -------------------
217 -- | Phylomemy | --
218 -------------------
219
220
221 -- | Phylo datatype of a phylomemy
222 -- foundations : the foundations of the phylo
223 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
224 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
225 -- param : the parameters of the phylomemy (with the user's configuration)
226 -- periods : the temporal steps of a phylomemy
227 data Phylo =
228 Phylo { _phylo_foundations :: PhyloFoundations
229 , _phylo_timeCooc :: !(Map Date Cooc)
230 , _phylo_timeDocs :: !(Map Date Double)
231 , _phylo_param :: PhyloParam
232 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
233 }
234 deriving (Generic, Show, Eq)
235
236
237 -- | PhyloPeriodId : the id of a given period
238 type PhyloPeriodId = (Date,Date)
239
240 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
241 -- id: tuple (start date, end date) of the temporal step of the phylomemy
242 -- levels: levels of granularity
243 data PhyloPeriod =
244 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
245 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
246 } deriving (Generic, Show, Eq)
247
248
249 -- | Level : a level of clustering
250 type Level = Int
251
252 -- | PhyloLevelId : the id of a level of clustering in a given period
253 type PhyloLevelId = (PhyloPeriodId,Level)
254
255 -- | PhyloLevel : levels of phylomemy on a synchronic axis
256 -- Levels description:
257 -- Level 0: The foundations and the base of the phylo
258 -- Level 1: First level of clustering (the Fis)
259 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
260 data PhyloLevel =
261 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
262 , _phylo_levelLevel :: Level
263 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
264 }
265 deriving (Generic, Show, Eq)
266
267
268 type PhyloGroupId = (PhyloLevelId, Int)
269
270 -- | BranchId : (a level, a sequence of branch index)
271 -- the sequence is a path of heritage from the most to the less specific branch
272 type PhyloBranchId = (Level, [Int])
273
274 -- | PhyloGroup : group of ngrams at each level and period
275 data PhyloGroup =
276 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
277 , _phylo_groupLevel :: Level
278 , _phylo_groupIndex :: Int
279 , _phylo_groupLabel :: Text
280 , _phylo_groupSupport :: Support
281 , _phylo_groupNgrams :: [Int]
282 , _phylo_groupCooc :: !(Cooc)
283 , _phylo_groupBranchId :: PhyloBranchId
284 , _phylo_groupMeta :: Map Text [Double]
285 , _phylo_groupLevelParents :: [Pointer]
286 , _phylo_groupLevelChilds :: [Pointer]
287 , _phylo_groupPeriodParents :: [Pointer]
288 , _phylo_groupPeriodChilds :: [Pointer]
289 }
290 deriving (Generic, Show, Eq, NFData)
291
292 -- | Weight : A generic mesure that can be associated with an Id
293 type Weight = Double
294
295 -- | Pointer : A weighted pointer to a given PhyloGroup
296 type Pointer = (PhyloGroupId, Weight)
297
298 data Filiation = ToParents | ToChilds deriving (Generic, Show)
299 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
300
301
302 ---------------------------
303 -- | Frequent Item Set | --
304 ---------------------------
305
306 -- | Clique : Set of ngrams cooccurring in the same Document
307 type Clique = Set Ngrams
308
309 -- | Support : Number of Documents where a Clique occurs
310 type Support = Int
311
312 -- | Fis : Frequent Items Set (ie: the association between a Clique and a Support)
313 data PhyloFis = PhyloFis
314 { _phyloFis_clique :: Clique
315 , _phyloFis_support :: Support
316 , _phyloFis_period :: (Date,Date)
317 } deriving (Generic,NFData,Show,Eq)
318
319
320 ----------------
321 -- | Export | --
322 ----------------
323
324 type DotId = TextLazy.Text
325
326 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
327
328 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
329
330 data Order = Asc | Desc deriving (Show,Generic,Eq)
331
332 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
333
334 data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
335
336 data PhyloLabel =
337 BranchLabel
338 { _branch_labelTagger :: Tagger
339 , _branch_labelSize :: Int }
340 | GroupLabel
341 { _group_labelTagger :: Tagger
342 , _group_labelSize :: Int }
343 deriving (Show,Generic,Eq)
344
345 data PhyloBranch =
346 PhyloBranch
347 { _branch_id :: PhyloBranchId
348 , _branch_label :: Text
349 , _branch_meta :: Map Text [Double]
350 } deriving (Generic, Show)
351
352 data PhyloExport =
353 PhyloExport
354 { _export_groups :: [PhyloGroup]
355 , _export_branches :: [PhyloBranch]
356 } deriving (Generic, Show)
357
358 ----------------
359 -- | Lenses | --
360 ----------------
361
362 makeLenses ''Config
363 makeLenses ''Proximity
364 makeLenses ''ContextualUnit
365 makeLenses ''PhyloLabel
366 makeLenses ''TimeUnit
367 makeLenses ''PhyloFoundations
368 makeLenses ''PhyloFis
369 makeLenses ''Phylo
370 makeLenses ''PhyloPeriod
371 makeLenses ''PhyloLevel
372 makeLenses ''PhyloGroup
373 makeLenses ''PhyloParam
374 makeLenses ''PhyloExport
375 makeLenses ''PhyloBranch
376
377 ------------------------
378 -- | JSON instances | --
379 ------------------------
380
381
382 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)