]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
[FIX] Merge
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Swagger
35 import Data.Text (Text, pack)
36 import Data.Vector (Vector)
37 import GHC.Generics
38 import GHC.IO (FilePath)
39 import Gargantext.Core.Text.Context (TermList)
40 import Gargantext.Core.Utils.Prefix (unPrefix)
41 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
42 import Gargantext.Prelude
43 import qualified Data.Text.Lazy as TextLazy
44
45 ----------------
46 -- | PhyloConfig | --
47 ----------------
48
49 data CorpusParser =
50 Wos {_wos_limit :: Int}
51 | Csv {_csv_limit :: Int}
52 | Csv' {_csv'_limit :: Int}
53 deriving (Show,Generic,Eq)
54
55 instance ToSchema CorpusParser where
56 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
57
58
59 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
60 instance ToSchema ListParser
61
62
63 data SeaElevation =
64 Constante
65 { _cons_start :: Double
66 , _cons_step :: Double }
67 | Adaptative
68 { _adap_granularity :: Double }
69 deriving (Show,Generic,Eq)
70
71 instance ToSchema SeaElevation
72
73 data Proximity =
74 WeightedLogJaccard
75 { _wlj_sensibility :: Double
76 {-
77 -- , _wlj_thresholdInit :: Double
78 -- , _wlj_thresholdStep :: Double
79 -- | max height for sea level in temporal matching
80 -- , _wlj_elevation :: Double
81 -}
82 }
83 | WeightedLogSim
84 { _wlj_sensibility :: Double
85 {-
86 -- , _wlj_thresholdInit :: Double
87 -- , _wlj_thresholdStep :: Double
88 -- | max height for sea level in temporal matching
89 -- , _wlj_elevation :: Double
90 -}
91 }
92 | Hamming { _wlj_sensibility :: Double }
93
94 deriving (Show,Generic,Eq)
95
96 instance ToSchema Proximity where
97 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
98
99
100 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
101 deriving (Show,Generic,Eq, ToSchema)
102
103 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
104 deriving (Show,Generic,Eq)
105
106 instance ToSchema SynchronyStrategy where
107 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
108
109
110 data Synchrony =
111 ByProximityThreshold
112 { _bpt_threshold :: Double
113 , _bpt_sensibility :: Double
114 , _bpt_scope :: SynchronyScope
115 , _bpt_strategy :: SynchronyStrategy }
116 | ByProximityDistribution
117 { _bpd_sensibility :: Double
118 , _bpd_strategy :: SynchronyStrategy }
119 deriving (Show,Generic,Eq)
120
121 instance ToSchema Synchrony where
122 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
123
124
125
126 data TimeUnit =
127 Epoch
128 { _epoch_period :: Int
129 , _epoch_step :: Int
130 , _epoch_matchingFrame :: Int }
131 | Year
132 { _year_period :: Int
133 , _year_step :: Int
134 , _year_matchingFrame :: Int }
135 | Month
136 { _month_period :: Int
137 , _month_step :: Int
138 , _month_matchingFrame :: Int }
139 | Week
140 { _week_period :: Int
141 , _week_step :: Int
142 , _week_matchingFrame :: Int }
143 | Day
144 { _day_period :: Int
145 , _day_step :: Int
146 , _day_matchingFrame :: Int }
147 deriving (Show,Generic,Eq)
148
149 instance ToSchema TimeUnit where
150 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
151
152
153 data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
154
155 instance ToSchema CliqueFilter where
156 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
157
158
159
160 data Clique =
161 Fis
162 { _fis_support :: Int
163 , _fis_size :: Int }
164 | MaxClique
165 { _mcl_size :: Int
166 , _mcl_threshold :: Double
167 , _mcl_filter :: CliqueFilter }
168 deriving (Show,Generic,Eq)
169
170 instance ToSchema Clique where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
172
173
174 data Quality =
175 Quality { _qua_granularity :: Double
176 , _qua_minBranch :: Int }
177 deriving (Show,Generic,Eq)
178
179 instance ToSchema Quality where
180 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
181
182
183 data PhyloConfig =
184 PhyloConfig { corpusPath :: FilePath
185 , listPath :: FilePath
186 , outputPath :: FilePath
187 , corpusParser :: CorpusParser
188 , listParser :: ListParser
189 , phyloName :: Text
190 , phyloLevel :: Int
191 , phyloProximity :: Proximity
192 , seaElevation :: SeaElevation
193 , findAncestors :: Bool
194 , phyloSynchrony :: Synchrony
195 , phyloQuality :: Quality
196 , timeUnit :: TimeUnit
197 , clique :: Clique
198 , exportLabel :: [PhyloLabel]
199 , exportSort :: Sort
200 , exportFilter :: [Filter]
201 } deriving (Show,Generic,Eq)
202
203
204 ------------------------------------------------------------------------
205 data PhyloSubConfig =
206 PhyloSubConfig { _sc_phyloProximity :: Double
207 , _sc_phyloSynchrony :: Double
208 , _sc_phyloQuality :: Double
209 , _sc_timeUnit :: TimeUnit
210 , _sc_clique :: Clique
211 , _sc_exportFilter :: Double
212 }
213 deriving (Show,Generic,Eq)
214
215
216 subConfig2config :: PhyloSubConfig -> PhyloConfig
217 subConfig2config subConfig = defaultConfig { phyloProximity = WeightedLogJaccard $ _sc_phyloProximity subConfig
218 , phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
219 , phyloQuality = Quality (_sc_phyloQuality subConfig) 1
220 , timeUnit = _sc_timeUnit subConfig
221 , clique = _sc_clique subConfig
222 , exportFilter = [ByBranchSize $ _sc_exportFilter subConfig]
223 }
224
225 ------------------------------------------------------------------------
226 defaultConfig :: PhyloConfig
227 defaultConfig =
228 PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
229 , listPath = "list.csv" -- useful for commandline only
230 , outputPath = "data/"
231 , corpusParser = Csv 100000
232 , listParser = V4
233 , phyloName = pack "Phylo Name"
234 , phyloLevel = 2
235 , phyloProximity = WeightedLogJaccard 0.5
236 , seaElevation = Constante 0.1 0.1
237 , findAncestors = False
238 , phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
239 , phyloQuality = Quality 0.5 1
240 , timeUnit = Year 3 1 5
241 , clique = MaxClique 5 0.0001 ByThreshold
242 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
243 , exportSort = ByHierarchy Desc
244 , exportFilter = [ByBranchSize 3]
245 }
246
247 -- Main Instances
248 instance ToSchema PhyloConfig
249 instance ToSchema PhyloSubConfig
250
251 instance FromJSON PhyloConfig
252 instance ToJSON PhyloConfig
253
254 instance FromJSON PhyloSubConfig
255 instance ToJSON PhyloSubConfig
256
257 instance FromJSON CorpusParser
258 instance ToJSON CorpusParser
259
260 instance FromJSON ListParser
261 instance ToJSON ListParser
262
263 instance FromJSON Proximity
264 instance ToJSON Proximity
265
266 instance FromJSON SeaElevation
267 instance ToJSON SeaElevation
268
269 instance FromJSON TimeUnit
270 instance ToJSON TimeUnit
271
272 instance FromJSON CliqueFilter
273 instance ToJSON CliqueFilter
274
275 instance FromJSON Clique
276 instance ToJSON Clique
277
278 instance FromJSON PhyloLabel
279 instance ToJSON PhyloLabel
280
281 instance FromJSON Tagger
282 instance ToJSON Tagger
283
284 instance FromJSON Sort
285 instance ToJSON Sort
286
287 instance FromJSON Order
288 instance ToJSON Order
289
290 instance FromJSON Filter
291 instance ToJSON Filter
292
293 instance FromJSON SynchronyScope
294 instance ToJSON SynchronyScope
295
296 instance FromJSON SynchronyStrategy
297 instance ToJSON SynchronyStrategy
298
299 instance FromJSON Synchrony
300 instance ToJSON Synchrony
301
302 instance FromJSON Quality
303 instance ToJSON Quality
304
305
306 -- | Software parameters
307 data Software =
308 Software { _software_name :: Text
309 , _software_version :: Text
310 } deriving (Generic, Show, Eq)
311
312 instance ToSchema Software where
313 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
314
315
316
317 defaultSoftware :: Software
318 defaultSoftware =
319 Software { _software_name = pack "Gargantext"
320 , _software_version = pack "v4" }
321
322
323 -- | Global parameters of a Phylo
324 data PhyloParam =
325 PhyloParam { _phyloParam_version :: Text
326 , _phyloParam_software :: Software
327 , _phyloParam_config :: PhyloConfig
328 } deriving (Generic, Show, Eq)
329
330 instance ToSchema PhyloParam where
331 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
332
333
334
335 defaultPhyloParam :: PhyloParam
336 defaultPhyloParam =
337 PhyloParam { _phyloParam_version = pack "v2.adaptative"
338 , _phyloParam_software = defaultSoftware
339 , _phyloParam_config = defaultConfig }
340
341
342 ------------------
343 -- | Document | --
344 ------------------
345
346 -- | Date : a simple Integer
347 type Date = Int
348
349 -- | Ngrams : a contiguous sequence of n terms
350 type Ngrams = Text
351
352 -- Document : a piece of Text linked to a Date
353 -- date = computational date; date' = original string date yyyy-mm-dd
354 -- Export Database to Document
355 data Document = Document
356 { date :: Date -- datatype Date {unDate :: Int}
357 , date' :: Text -- show date
358 , text :: [Ngrams]
359 , weight :: Maybe Double
360 , sources :: [Text]
361 } deriving (Eq,Show,Generic,NFData)
362
363
364 --------------------
365 -- | Foundation | --
366 --------------------
367
368
369 -- | The Foundations of a Phylo created from a given TermList
370 data PhyloFoundations = PhyloFoundations
371 { _foundations_roots :: !(Vector Ngrams)
372 , _foundations_mapList :: TermList
373 } deriving (Generic, Show, Eq)
374
375 instance ToSchema PhyloFoundations where
376 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
377
378
379
380 data PhyloSources = PhyloSources
381 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
382
383 instance ToSchema PhyloSources where
384 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
385
386 ---------------------------
387 -- | Coocurency Matrix | --
388 ---------------------------
389
390
391 -- | Cooc : a coocurency matrix between two ngrams
392 type Cooc = Map (Int,Int) Double
393
394
395 -------------------
396 -- | Phylomemy | --
397 -------------------
398
399
400 -- | Phylo datatype of a phylomemy
401 -- foundations : the foundations of the phylo
402 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
403 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
404 -- param : the parameters of the phylomemy (with the user's configuration)
405 -- periods : the temporal steps of a phylomemy
406 data Phylo =
407 Phylo { _phylo_foundations :: PhyloFoundations
408 , _phylo_sources :: PhyloSources
409 , _phylo_timeCooc :: !(Map Date Cooc)
410 , _phylo_timeDocs :: !(Map Date Double)
411 , _phylo_termFreq :: !(Map Int Double)
412 , _phylo_lastTermFreq :: !(Map Int Double)
413 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
414 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
415 , _phylo_param :: PhyloParam
416 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
417 }
418 deriving (Generic, Show, Eq)
419
420 instance ToSchema Phylo where
421 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
422
423
424 -- | PhyloPeriodId : the id of a given period
425 type PhyloPeriodId = (Date,Date)
426
427 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
428 -- id: tuple (start date, end date) of the temporal step of the phylomemy
429 -- levels: levels of granularity
430 data PhyloPeriod =
431 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
432 , _phylo_periodPeriod' :: (Text,Text)
433 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
434 } deriving (Generic, Show, Eq)
435
436 instance ToSchema PhyloPeriod where
437 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
438
439
440
441 -- | Level : a level of clustering
442 type Level = Int
443
444 -- | PhyloLevelId : the id of a level of clustering in a given period
445 type PhyloLevelId = (PhyloPeriodId,Level)
446
447 -- | PhyloLevel : levels of phylomemy on a synchronic axis
448 -- Levels description:
449 -- Level 0: The foundations and the base of the phylo
450 -- Level 1: First level of clustering (the Fis)
451 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
452 data PhyloLevel =
453 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
454 , _phylo_levelPeriod' :: (Text,Text)
455 , _phylo_levelLevel :: Level
456 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
457 }
458 deriving (Generic, Show, Eq)
459
460 instance ToSchema PhyloLevel where
461 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
462
463
464 type PhyloGroupId = (PhyloLevelId, Int)
465
466 -- | BranchId : (a level, a sequence of branch index)
467 -- the sequence is a path of heritage from the most to the less specific branch
468 type PhyloBranchId = (Level, [Int])
469
470 -- | PhyloGroup : group of ngrams at each level and period
471 data PhyloGroup =
472 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
473 , _phylo_groupPeriod' :: (Text,Text)
474 , _phylo_groupLevel :: Level
475 , _phylo_groupIndex :: Int
476 , _phylo_groupLabel :: Text
477 , _phylo_groupSupport :: Support
478 , _phylo_groupWeight :: Maybe Double
479 , _phylo_groupSources :: [Int]
480 , _phylo_groupNgrams :: [Int]
481 , _phylo_groupCooc :: !(Cooc)
482 , _phylo_groupBranchId :: PhyloBranchId
483 , _phylo_groupMeta :: Map Text [Double]
484 , _phylo_groupLevelParents :: [Pointer]
485 , _phylo_groupLevelChilds :: [Pointer]
486 , _phylo_groupPeriodParents :: [Pointer]
487 , _phylo_groupPeriodChilds :: [Pointer]
488 , _phylo_groupAncestors :: [Pointer]
489 , _phylo_groupPeriodMemoryParents :: [Pointer']
490 , _phylo_groupPeriodMemoryChilds :: [Pointer']
491 }
492 deriving (Generic, Show, Eq, NFData)
493
494 instance ToSchema PhyloGroup where
495 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
496
497
498 -- | Weight : A generic mesure that can be associated with an Id
499 type Weight = Double
500 type Thr = Double
501
502 -- | Pointer : A weighted pointer to a given PhyloGroup
503 type Pointer = (PhyloGroupId, Weight)
504 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
505 type Pointer' = (PhyloGroupId, (Thr,Weight))
506
507 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
508 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
509
510
511 ----------------------
512 -- | Phylo Clique | --
513 ----------------------
514
515 -- | Support : Number of Documents where a Clique occurs
516 type Support = Int
517
518 data PhyloClique = PhyloClique
519 { _phyloClique_nodes :: [Int]
520 , _phyloClique_support :: Support
521 , _phyloClique_period :: (Date,Date)
522 , _phyloClique_weight :: Maybe Double
523 , _phyloClique_sources :: [Int]
524 } deriving (Generic,NFData,Show,Eq)
525
526 ----------------
527 -- | Export | --
528 ----------------
529
530 type DotId = TextLazy.Text
531
532 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
533
534 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
535 instance ToSchema Filter where
536 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
537
538
539 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
540
541 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
542 instance ToSchema Sort where
543 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
544
545
546 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
547 instance ToSchema Tagger where
548 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
549
550
551 data PhyloLabel =
552 BranchLabel
553 { _branch_labelTagger :: Tagger
554 , _branch_labelSize :: Int }
555 | GroupLabel
556 { _group_labelTagger :: Tagger
557 , _group_labelSize :: Int }
558 deriving (Show,Generic,Eq)
559
560 instance ToSchema PhyloLabel where
561 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
562
563
564 data PhyloBranch =
565 PhyloBranch
566 { _branch_id :: PhyloBranchId
567 , _branch_canonId :: [Int]
568 , _branch_seaLevel :: [Double]
569 , _branch_x :: Double
570 , _branch_y :: Double
571 , _branch_w :: Double
572 , _branch_t :: Double
573 , _branch_label :: Text
574 , _branch_meta :: Map Text [Double]
575 } deriving (Generic, Show, Eq)
576
577 instance ToSchema PhyloBranch where
578 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
579
580 data PhyloExport =
581 PhyloExport
582 { _export_groups :: [PhyloGroup]
583 , _export_branches :: [PhyloBranch]
584 } deriving (Generic, Show)
585 instance ToSchema PhyloExport where
586 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
587
588
589 ----------------
590 -- | Lenses | --
591 ----------------
592
593 makeLenses ''PhyloConfig
594 makeLenses ''PhyloSubConfig
595 makeLenses ''Proximity
596 makeLenses ''SeaElevation
597 makeLenses ''Quality
598 makeLenses ''Clique
599 makeLenses ''PhyloLabel
600 makeLenses ''TimeUnit
601 makeLenses ''PhyloFoundations
602 makeLenses ''PhyloClique
603 makeLenses ''Phylo
604 makeLenses ''PhyloPeriod
605 makeLenses ''PhyloLevel
606 makeLenses ''PhyloGroup
607 makeLenses ''PhyloParam
608 makeLenses ''PhyloExport
609 makeLenses ''PhyloBranch
610
611 ------------------------
612 -- | JSON instances | --
613 ------------------------
614
615 instance FromJSON Phylo
616 instance ToJSON Phylo
617
618 instance FromJSON PhyloSources
619 instance ToJSON PhyloSources
620
621 instance FromJSON PhyloParam
622 instance ToJSON PhyloParam
623
624 instance FromJSON PhyloPeriod
625 instance ToJSON PhyloPeriod
626
627 instance FromJSON PhyloLevel
628 instance ToJSON PhyloLevel
629
630 instance FromJSON Software
631 instance ToJSON Software
632
633 instance FromJSON PhyloGroup
634 instance ToJSON PhyloGroup
635
636 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)