]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
add a list parser param
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Data.Swagger
30 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
31 import Control.DeepSeq (NFData)
32 import Control.Lens (makeLenses)
33 import Data.Aeson
34 import Data.Aeson.TH (deriveJSON)
35 import Data.Map (Map)
36 import Data.Text (Text, pack)
37 import Data.Vector (Vector)
38 import GHC.Generics
39 import GHC.IO (FilePath)
40 import Gargantext.Core.Text.Context (TermList)
41 import Gargantext.Core.Utils.Prefix (unPrefix)
42 import Gargantext.Prelude
43 import qualified Data.Text.Lazy as TextLazy
44
45 ----------------
46 -- | Config | --
47 ----------------
48
49 data CorpusParser =
50 Wos {_wos_limit :: Int}
51 | Csv {_csv_limit :: Int}
52 | Csv' {_csv'_limit :: Int}
53 deriving (Show,Generic,Eq)
54
55 instance ToSchema CorpusParser where
56 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
57
58
59 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
60 instance ToSchema ListParser
61
62
63 data SeaElevation =
64 Constante
65 { _cons_start :: Double
66 , _cons_step :: Double }
67 | Adaptative
68 { _adap_granularity :: Double }
69 deriving (Show,Generic,Eq)
70
71 instance ToSchema SeaElevation
72
73 data Proximity =
74 WeightedLogJaccard
75 { _wlj_sensibility :: Double
76 {-
77 -- , _wlj_thresholdInit :: Double
78 -- , _wlj_thresholdStep :: Double
79 -- | max height for sea level in temporal matching
80 -- , _wlj_elevation :: Double
81 -}
82 }
83 | WeightedLogSim
84 { _wlj_sensibility :: Double
85 {-
86 -- , _wlj_thresholdInit :: Double
87 -- , _wlj_thresholdStep :: Double
88 -- | max height for sea level in temporal matching
89 -- , _wlj_elevation :: Double
90 -}
91 }
92 | Hamming { _wlj_sensibility :: Double }
93
94 deriving (Show,Generic,Eq)
95
96 instance ToSchema Proximity where
97 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
98
99
100 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
101 deriving (Show,Generic,Eq, ToSchema)
102
103 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
104 deriving (Show,Generic,Eq)
105
106 instance ToSchema SynchronyStrategy where
107 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
108
109
110 data Synchrony =
111 ByProximityThreshold
112 { _bpt_threshold :: Double
113 , _bpt_sensibility :: Double
114 , _bpt_scope :: SynchronyScope
115 , _bpt_strategy :: SynchronyStrategy }
116 | ByProximityDistribution
117 { _bpd_sensibility :: Double
118 , _bpd_strategy :: SynchronyStrategy }
119 deriving (Show,Generic,Eq)
120
121 instance ToSchema Synchrony where
122 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
123
124
125
126 data TimeUnit =
127 Epoch
128 { _epoch_period :: Int
129 , _epoch_step :: Int
130 , _epoch_matchingFrame :: Int }
131 | Year
132 { _year_period :: Int
133 , _year_step :: Int
134 , _year_matchingFrame :: Int }
135 | Month
136 { _month_period :: Int
137 , _month_step :: Int
138 , _month_matchingFrame :: Int }
139 | Week
140 { _week_period :: Int
141 , _week_step :: Int
142 , _week_matchingFrame :: Int }
143 | Day
144 { _day_period :: Int
145 , _day_step :: Int
146 , _day_matchingFrame :: Int }
147 deriving (Show,Generic,Eq)
148
149 instance ToSchema TimeUnit where
150 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
151
152
153 data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
154
155 instance ToSchema CliqueFilter where
156 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
157
158
159
160 data Clique =
161 Fis
162 { _fis_support :: Int
163 , _fis_size :: Int }
164 | MaxClique
165 { _mcl_size :: Int
166 , _mcl_threshold :: Double
167 , _mcl_filter :: CliqueFilter }
168 deriving (Show,Generic,Eq)
169
170 instance ToSchema Clique where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
172
173
174 data Quality =
175 Quality { _qua_granularity :: Double
176 , _qua_minBranch :: Int }
177 deriving (Show,Generic,Eq)
178
179 instance ToSchema Quality where
180 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
181
182
183
184 data Config =
185 Config { corpusPath :: FilePath
186 , listPath :: FilePath
187 , outputPath :: FilePath
188 , corpusParser :: CorpusParser
189 , listParser :: ListParser
190 , phyloName :: Text
191 , phyloLevel :: Int
192 , phyloProximity :: Proximity
193 , seaElevation :: SeaElevation
194 , findAncestors :: Bool
195 , phyloSynchrony :: Synchrony
196 , phyloQuality :: Quality
197 , timeUnit :: TimeUnit
198 , clique :: Clique
199 , exportLabel :: [PhyloLabel]
200 , exportSort :: Sort
201 , exportFilter :: [Filter]
202 } deriving (Show,Generic,Eq)
203
204 instance ToSchema Config
205
206
207 defaultConfig :: Config
208 defaultConfig =
209 Config { corpusPath = "corpus.csv" -- useful for commandline only
210 , listPath = "list.csv" -- useful for commandline only
211 , outputPath = "data/"
212 , corpusParser = Csv 100000
213 , listParser = V4
214 , phyloName = pack "Phylo Name"
215 , phyloLevel = 2
216 , phyloProximity = WeightedLogJaccard 0.5
217 , seaElevation = Constante 0.1 0.1
218 , findAncestors = False
219 , phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
220 , phyloQuality = Quality 0.5 1
221 , timeUnit = Year 3 1 5
222 , clique = MaxClique 5 0.0001 ByThreshold
223 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
224 , exportSort = ByHierarchy Desc
225 , exportFilter = [ByBranchSize 3]
226 }
227
228 instance FromJSON Config
229 instance ToJSON Config
230
231 instance FromJSON CorpusParser
232 instance ToJSON CorpusParser
233
234 instance FromJSON ListParser
235 instance ToJSON ListParser
236
237 instance FromJSON Proximity
238 instance ToJSON Proximity
239
240 instance FromJSON SeaElevation
241 instance ToJSON SeaElevation
242
243 instance FromJSON TimeUnit
244 instance ToJSON TimeUnit
245
246 instance FromJSON CliqueFilter
247 instance ToJSON CliqueFilter
248
249 instance FromJSON Clique
250 instance ToJSON Clique
251
252 instance FromJSON PhyloLabel
253 instance ToJSON PhyloLabel
254
255 instance FromJSON Tagger
256 instance ToJSON Tagger
257
258 instance FromJSON Sort
259 instance ToJSON Sort
260
261 instance FromJSON Order
262 instance ToJSON Order
263
264 instance FromJSON Filter
265 instance ToJSON Filter
266
267 instance FromJSON SynchronyScope
268 instance ToJSON SynchronyScope
269
270 instance FromJSON SynchronyStrategy
271 instance ToJSON SynchronyStrategy
272
273 instance FromJSON Synchrony
274 instance ToJSON Synchrony
275
276 instance FromJSON Quality
277 instance ToJSON Quality
278
279
280 -- | Software parameters
281 data Software =
282 Software { _software_name :: Text
283 , _software_version :: Text
284 } deriving (Generic, Show, Eq)
285
286 instance ToSchema Software where
287 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
288
289
290
291 defaultSoftware :: Software
292 defaultSoftware =
293 Software { _software_name = pack "Gargantext"
294 , _software_version = pack "v4" }
295
296
297 -- | Global parameters of a Phylo
298 data PhyloParam =
299 PhyloParam { _phyloParam_version :: Text
300 , _phyloParam_software :: Software
301 , _phyloParam_config :: Config
302 } deriving (Generic, Show, Eq)
303
304 instance ToSchema PhyloParam where
305 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
306
307
308
309 defaultPhyloParam :: PhyloParam
310 defaultPhyloParam =
311 PhyloParam { _phyloParam_version = pack "v2.adaptative"
312 , _phyloParam_software = defaultSoftware
313 , _phyloParam_config = defaultConfig }
314
315
316 ------------------
317 -- | Document | --
318 ------------------
319
320 -- | Date : a simple Integer
321 type Date = Int
322
323 -- | Ngrams : a contiguous sequence of n terms
324 type Ngrams = Text
325
326 -- Document : a piece of Text linked to a Date
327 -- date = computational date; date' = original string date yyyy-mm-dd
328 -- Export Database to Document
329 data Document = Document
330 { date :: Date -- datatype Date {unDate :: Int}
331 , date' :: Text -- show date
332 , text :: [Ngrams]
333 , weight :: Maybe Double
334 , sources :: [Text]
335 } deriving (Eq,Show,Generic,NFData)
336
337
338 --------------------
339 -- | Foundation | --
340 --------------------
341
342
343 -- | The Foundations of a Phylo created from a given TermList
344 data PhyloFoundations = PhyloFoundations
345 { _foundations_roots :: !(Vector Ngrams)
346 , _foundations_mapList :: TermList
347 } deriving (Generic, Show, Eq)
348
349 instance ToSchema PhyloFoundations where
350 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
351
352
353
354 data PhyloSources = PhyloSources
355 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
356
357 instance ToSchema PhyloSources where
358 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
359
360 ---------------------------
361 -- | Coocurency Matrix | --
362 ---------------------------
363
364
365 -- | Cooc : a coocurency matrix between two ngrams
366 type Cooc = Map (Int,Int) Double
367
368
369 -------------------
370 -- | Phylomemy | --
371 -------------------
372
373
374 -- | Phylo datatype of a phylomemy
375 -- foundations : the foundations of the phylo
376 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
377 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
378 -- param : the parameters of the phylomemy (with the user's configuration)
379 -- periods : the temporal steps of a phylomemy
380 data Phylo =
381 Phylo { _phylo_foundations :: PhyloFoundations
382 , _phylo_sources :: PhyloSources
383 , _phylo_timeCooc :: !(Map Date Cooc)
384 , _phylo_timeDocs :: !(Map Date Double)
385 , _phylo_termFreq :: !(Map Int Double)
386 , _phylo_lastTermFreq :: !(Map Int Double)
387 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
388 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
389 , _phylo_param :: PhyloParam
390 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
391 }
392 deriving (Generic, Show, Eq)
393
394 instance ToSchema Phylo where
395 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
396
397
398 -- | PhyloPeriodId : the id of a given period
399 type PhyloPeriodId = (Date,Date)
400
401 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
402 -- id: tuple (start date, end date) of the temporal step of the phylomemy
403 -- levels: levels of granularity
404 data PhyloPeriod =
405 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
406 , _phylo_periodPeriod' :: (Text,Text)
407 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
408 } deriving (Generic, Show, Eq)
409
410 instance ToSchema PhyloPeriod where
411 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
412
413
414
415 -- | Level : a level of clustering
416 type Level = Int
417
418 -- | PhyloLevelId : the id of a level of clustering in a given period
419 type PhyloLevelId = (PhyloPeriodId,Level)
420
421 -- | PhyloLevel : levels of phylomemy on a synchronic axis
422 -- Levels description:
423 -- Level 0: The foundations and the base of the phylo
424 -- Level 1: First level of clustering (the Fis)
425 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
426 data PhyloLevel =
427 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
428 , _phylo_levelPeriod' :: (Text,Text)
429 , _phylo_levelLevel :: Level
430 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
431 }
432 deriving (Generic, Show, Eq)
433
434 instance ToSchema PhyloLevel where
435 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
436
437
438 type PhyloGroupId = (PhyloLevelId, Int)
439
440 -- | BranchId : (a level, a sequence of branch index)
441 -- the sequence is a path of heritage from the most to the less specific branch
442 type PhyloBranchId = (Level, [Int])
443
444 -- | PhyloGroup : group of ngrams at each level and period
445 data PhyloGroup =
446 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
447 , _phylo_groupPeriod' :: (Text,Text)
448 , _phylo_groupLevel :: Level
449 , _phylo_groupIndex :: Int
450 , _phylo_groupLabel :: Text
451 , _phylo_groupSupport :: Support
452 , _phylo_groupWeight :: Maybe Double
453 , _phylo_groupSources :: [Int]
454 , _phylo_groupNgrams :: [Int]
455 , _phylo_groupCooc :: !(Cooc)
456 , _phylo_groupBranchId :: PhyloBranchId
457 , _phylo_groupMeta :: Map Text [Double]
458 , _phylo_groupLevelParents :: [Pointer]
459 , _phylo_groupLevelChilds :: [Pointer]
460 , _phylo_groupPeriodParents :: [Pointer]
461 , _phylo_groupPeriodChilds :: [Pointer]
462 , _phylo_groupAncestors :: [Pointer]
463 , _phylo_groupPeriodMemoryParents :: [Pointer']
464 , _phylo_groupPeriodMemoryChilds :: [Pointer']
465 }
466 deriving (Generic, Show, Eq, NFData)
467
468 instance ToSchema PhyloGroup where
469 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
470
471
472 -- | Weight : A generic mesure that can be associated with an Id
473 type Weight = Double
474 type Thr = Double
475
476 -- | Pointer : A weighted pointer to a given PhyloGroup
477 type Pointer = (PhyloGroupId, Weight)
478 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
479 type Pointer' = (PhyloGroupId, (Thr,Weight))
480
481 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
482 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
483
484
485 ----------------------
486 -- | Phylo Clique | --
487 ----------------------
488
489 -- | Support : Number of Documents where a Clique occurs
490 type Support = Int
491
492 data PhyloClique = PhyloClique
493 { _phyloClique_nodes :: [Int]
494 , _phyloClique_support :: Support
495 , _phyloClique_period :: (Date,Date)
496 , _phyloClique_weight :: Maybe Double
497 , _phyloClique_sources :: [Int]
498 } deriving (Generic,NFData,Show,Eq)
499
500 ----------------
501 -- | Export | --
502 ----------------
503
504 type DotId = TextLazy.Text
505
506 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
507
508 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
509 instance ToSchema Filter where
510 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
511
512
513 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
514
515 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
516 instance ToSchema Sort where
517 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
518
519
520 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
521 instance ToSchema Tagger where
522 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
523
524
525 data PhyloLabel =
526 BranchLabel
527 { _branch_labelTagger :: Tagger
528 , _branch_labelSize :: Int }
529 | GroupLabel
530 { _group_labelTagger :: Tagger
531 , _group_labelSize :: Int }
532 deriving (Show,Generic,Eq)
533
534 instance ToSchema PhyloLabel where
535 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
536
537
538 data PhyloBranch =
539 PhyloBranch
540 { _branch_id :: PhyloBranchId
541 , _branch_canonId :: [Int]
542 , _branch_seaLevel :: [Double]
543 , _branch_x :: Double
544 , _branch_y :: Double
545 , _branch_w :: Double
546 , _branch_t :: Double
547 , _branch_label :: Text
548 , _branch_meta :: Map Text [Double]
549 } deriving (Generic, Show, Eq)
550
551 instance ToSchema PhyloBranch where
552 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
553
554 data PhyloExport =
555 PhyloExport
556 { _export_groups :: [PhyloGroup]
557 , _export_branches :: [PhyloBranch]
558 } deriving (Generic, Show)
559 instance ToSchema PhyloExport where
560 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
561
562
563 ----------------
564 -- | Lenses | --
565 ----------------
566
567 makeLenses ''Config
568 makeLenses ''Proximity
569 makeLenses ''SeaElevation
570 makeLenses ''Quality
571 makeLenses ''Clique
572 makeLenses ''PhyloLabel
573 makeLenses ''TimeUnit
574 makeLenses ''PhyloFoundations
575 makeLenses ''PhyloClique
576 makeLenses ''Phylo
577 makeLenses ''PhyloPeriod
578 makeLenses ''PhyloLevel
579 makeLenses ''PhyloGroup
580 makeLenses ''PhyloParam
581 makeLenses ''PhyloExport
582 makeLenses ''PhyloBranch
583
584 ------------------------
585 -- | JSON instances | --
586 ------------------------
587
588 instance FromJSON Phylo
589 instance ToJSON Phylo
590
591 instance FromJSON PhyloSources
592 instance ToJSON PhyloSources
593
594 instance FromJSON PhyloParam
595 instance ToJSON PhyloParam
596
597 instance FromJSON PhyloPeriod
598 instance ToJSON PhyloPeriod
599
600 instance FromJSON PhyloLevel
601 instance ToJSON PhyloLevel
602
603 instance FromJSON Software
604 instance ToJSON Software
605
606 instance FromJSON PhyloGroup
607 instance ToJSON PhyloGroup
608
609 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)