]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
Merge branch 'dev' into dev-openalex
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Swagger
35 import Data.Text (Text, pack)
36 import Data.Vector (Vector)
37 import GHC.Generics
38 import GHC.IO (FilePath)
39 import Gargantext.Core.Utils.Prefix (unPrefix)
40 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
41 import Gargantext.Prelude
42 import qualified Data.Text.Lazy as TextLazy
43
44 ---------------------
45 -- | PhyloConfig | --
46 ---------------------
47
48 data CorpusParser =
49 Wos {_wos_limit :: Int}
50 | Csv {_csv_limit :: Int}
51 | Csv' {_csv'_limit :: Int}
52 deriving (Show,Generic,Eq)
53
54 instance ToSchema CorpusParser where
55 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
56
57
58 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
59 instance ToSchema ListParser
60
61
62 data SeaElevation =
63 Constante
64 { _cons_start :: Double
65 , _cons_gap :: Double }
66 | Adaptative
67 { _adap_steps :: Double }
68 | Evolving
69 { _evol_neighborhood :: Bool }
70 deriving (Show,Generic,Eq)
71
72 instance ToSchema SeaElevation
73
74 data PhyloSimilarity =
75 WeightedLogJaccard
76 { _wlj_sensibility :: Double
77 , _wlj_minSharedNgrams :: Int }
78 | WeightedLogSim
79 { _wls_sensibility :: Double
80 , _wls_minSharedNgrams :: Int }
81 | Hamming
82 { _hmg_sensibility :: Double
83 , _hmg_minSharedNgrams :: Int}
84
85 deriving (Show,Generic,Eq)
86
87 instance ToSchema PhyloSimilarity where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
89
90
91 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
92 deriving (Show,Generic,Eq, ToSchema)
93
94 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
95 deriving (Show,Generic,Eq)
96
97 instance ToSchema SynchronyStrategy where
98 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
99
100
101 data Synchrony =
102 ByProximityThreshold
103 { _bpt_threshold :: Double
104 , _bpt_sensibility :: Double
105 , _bpt_scope :: SynchronyScope
106 , _bpt_strategy :: SynchronyStrategy }
107 | ByProximityDistribution
108 { _bpd_sensibility :: Double
109 , _bpd_strategy :: SynchronyStrategy }
110 deriving (Show,Generic,Eq)
111
112 instance ToSchema Synchrony where
113 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
114
115
116
117 data TimeUnit =
118 Epoch
119 { _epoch_period :: Int
120 , _epoch_step :: Int
121 , _epoch_matchingFrame :: Int }
122 | Year
123 { _year_period :: Int
124 , _year_step :: Int
125 , _year_matchingFrame :: Int }
126 | Month
127 { _month_period :: Int
128 , _month_step :: Int
129 , _month_matchingFrame :: Int }
130 | Week
131 { _week_period :: Int
132 , _week_step :: Int
133 , _week_matchingFrame :: Int }
134 | Day
135 { _day_period :: Int
136 , _day_step :: Int
137 , _day_matchingFrame :: Int }
138 deriving (Show,Generic,Eq,NFData)
139
140 instance ToSchema TimeUnit where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
142
143
144 data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
145
146 instance ToSchema MaxCliqueFilter where
147 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
148
149
150
151 data Cluster =
152 Fis
153 { _fis_support :: Int
154 , _fis_size :: Int }
155 | MaxClique
156 { _mcl_size :: Int
157 , _mcl_threshold :: Double
158 , _mcl_filter :: MaxCliqueFilter }
159 deriving (Show,Generic,Eq)
160
161 instance ToSchema Cluster where
162 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
163
164
165 data Quality =
166 Quality { _qua_granularity :: Double
167 , _qua_minBranch :: Int }
168 deriving (Show,Generic,Eq)
169
170 instance ToSchema Quality where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
172
173
174 data PhyloConfig =
175 PhyloConfig { corpusPath :: FilePath
176 , listPath :: FilePath
177 , outputPath :: FilePath
178 , corpusParser :: CorpusParser
179 , listParser :: ListParser
180 , phyloName :: Text
181 , phyloScale :: Int
182 , similarity :: PhyloSimilarity
183 , seaElevation :: SeaElevation
184 , defaultMode :: Bool
185 , findAncestors :: Bool
186 , phyloSynchrony :: Synchrony
187 , phyloQuality :: Quality
188 , timeUnit :: TimeUnit
189 , clique :: Cluster
190 , exportLabel :: [PhyloLabel]
191 , exportSort :: Sort
192 , exportFilter :: [Filter]
193 } deriving (Show,Generic,Eq)
194
195
196 --------------------------------
197 -- | SubConfig API & 1Click | --
198 --------------------------------
199
200 data PhyloSubConfigAPI =
201 PhyloSubConfigAPI { _sc_phyloProximity :: Double
202 , _sc_phyloSynchrony :: Double
203 , _sc_phyloQuality :: Double
204 , _sc_timeUnit :: TimeUnit
205 , _sc_clique :: Cluster
206 , _sc_exportFilter :: Double
207 , _sc_defaultMode :: Bool
208 } deriving (Show,Generic,Eq)
209
210
211 subConfigAPI2config :: PhyloSubConfigAPI -> PhyloConfig
212 subConfigAPI2config subConfig = defaultConfig
213 { similarity = WeightedLogJaccard (_sc_phyloProximity subConfig) 2
214 , phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
215 , phyloQuality = Quality (_sc_phyloQuality subConfig) 3
216 , timeUnit = _sc_timeUnit subConfig
217 , clique = _sc_clique subConfig
218 , defaultMode = _sc_defaultMode subConfig
219 , exportFilter = [ByBranchSize $ _sc_exportFilter subConfig]
220 }
221
222 --------------------------
223 -- | SubConfig 1Click | --
224 --------------------------
225
226 defaultConfig :: PhyloConfig
227 defaultConfig =
228 PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
229 , listPath = "list.csv" -- useful for commandline only
230 , outputPath = "data/"
231 , corpusParser = Csv 150000
232 , listParser = V4
233 , phyloName = pack "Phylo Name"
234 , phyloScale = 2
235 , similarity = WeightedLogJaccard 0.5 2
236 , seaElevation = Constante 0.1 0.1
237 , defaultMode = False
238 , findAncestors = True
239 , phyloSynchrony = ByProximityThreshold 0.6 0 AllBranches MergeAllGroups
240 , phyloQuality = Quality 0.5 3
241 , timeUnit = Year 3 1 5
242 , clique = Fis 2 3
243 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
244 , exportSort = ByHierarchy Desc
245 , exportFilter = [ByBranchSize 3]
246 }
247
248 -- Main Instances
249 instance ToSchema PhyloConfig
250 instance ToSchema PhyloSubConfigAPI
251
252 instance FromJSON PhyloConfig
253 instance ToJSON PhyloConfig
254
255 instance FromJSON PhyloSubConfigAPI
256 instance ToJSON PhyloSubConfigAPI
257
258 instance FromJSON CorpusParser
259 instance ToJSON CorpusParser
260
261 instance FromJSON ListParser
262 instance ToJSON ListParser
263
264 instance FromJSON PhyloSimilarity
265 instance ToJSON PhyloSimilarity
266
267 instance FromJSON SeaElevation
268 instance ToJSON SeaElevation
269
270 instance FromJSON TimeUnit
271 instance ToJSON TimeUnit
272
273 instance FromJSON MaxCliqueFilter
274 instance ToJSON MaxCliqueFilter
275
276 instance FromJSON Cluster
277 instance ToJSON Cluster
278
279 instance FromJSON PhyloLabel
280 instance ToJSON PhyloLabel
281
282 instance FromJSON Tagger
283 instance ToJSON Tagger
284
285 instance FromJSON Sort
286 instance ToJSON Sort
287
288 instance FromJSON Order
289 instance ToJSON Order
290
291 instance FromJSON Filter
292 instance ToJSON Filter
293
294 instance FromJSON SynchronyScope
295 instance ToJSON SynchronyScope
296
297 instance FromJSON SynchronyStrategy
298 instance ToJSON SynchronyStrategy
299
300 instance FromJSON Synchrony
301 instance ToJSON Synchrony
302
303 instance FromJSON Quality
304 instance ToJSON Quality
305
306
307 -- | Software parameters
308 data Software =
309 Software { _software_name :: Text
310 , _software_version :: Text
311 } deriving (Generic, Show, Eq)
312
313 instance ToSchema Software where
314 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
315
316
317
318 defaultSoftware :: Software
319 defaultSoftware =
320 Software { _software_name = pack "GarganText"
321 , _software_version = pack "v5" }
322
323
324 -- | Global parameters of a Phylo
325 data PhyloParam =
326 PhyloParam { _phyloParam_version :: Text
327 , _phyloParam_software :: Software
328 , _phyloParam_config :: PhyloConfig
329 } deriving (Generic, Show, Eq)
330
331 instance ToSchema PhyloParam where
332 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
333
334
335
336 defaultPhyloParam :: PhyloParam
337 defaultPhyloParam =
338 PhyloParam { _phyloParam_version = pack "v3"
339 , _phyloParam_software = defaultSoftware
340 , _phyloParam_config = defaultConfig }
341
342
343 ------------------
344 -- | Document | --
345 ------------------
346
347 -- | Date : a simple Integer
348 type Date = Int
349
350 -- | DateStr : the string version of a Date
351 type DateStr = Text
352
353 -- | Ngrams : a contiguous sequence of n terms
354 type Ngrams = Text
355
356 -- Document : a piece of Text linked to a Date
357 -- date = computational date; date' = original string date yyyy-mm-dd
358 -- Export Database to Document
359 data Document = Document
360 { date :: Date -- datatype Date {unDate :: Int}
361 , date' :: DateStr -- show date
362 , text :: [Ngrams]
363 , weight :: Maybe Double
364 , sources :: [Text]
365 , docTime :: TimeUnit
366 } deriving (Eq,Show,Generic,NFData)
367
368
369 --------------------
370 -- | Foundation | --
371 --------------------
372
373
374 -- | The Foundations of a Phylo created from a given TermList
375 data PhyloFoundations = PhyloFoundations
376 { _foundations_roots :: (Vector Ngrams)
377 , _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
378 } deriving (Generic, Show, Eq)
379
380 data PhyloCounts = PhyloCounts
381 { coocByDate :: !(Map Date Cooc)
382 , docsByDate :: !(Map Date Double)
383 , rootsCountByDate :: !(Map Date (Map Int Double))
384 , rootsCount :: !(Map Int Double)
385 , rootsFreq :: !(Map Int Double)
386 , lastRootsFreq :: !(Map Int Double)
387 } deriving (Generic, Show, Eq)
388
389 data PhyloSources = PhyloSources
390 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
391
392 instance ToSchema PhyloFoundations where
393 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
394 instance ToSchema PhyloCounts where
395 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
396 instance ToSchema PhyloSources where
397 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
398
399 ---------------------------
400 -- | Coocurency Matrix | --
401 ---------------------------
402
403
404 -- | Cooc : a coocurency matrix between two ngrams
405 type Cooc = Map (Int,Int) Double
406
407
408 -------------------
409 -- | Phylomemy | --
410 -------------------
411
412 -- | Period : a tuple of Dates
413 type Period = (Date,Date)
414
415 -- | PeriodStr : a tuple of DateStr
416 type PeriodStr = (DateStr,DateStr)
417
418
419
420
421 -- | Phylo datatype of a phylomemy
422 -- foundations : the foundations of the phylo
423 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
424 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
425 -- param : the parameters of the phylomemy (with the user's configuration)
426 -- periods : the temporal steps of a phylomemy
427 data Phylo =
428 Phylo { _phylo_foundations :: PhyloFoundations
429 , _phylo_sources :: PhyloSources
430 , _phylo_counts :: PhyloCounts
431 , _phylo_seaLadder :: [Double]
432 , _phylo_param :: PhyloParam
433 , _phylo_periods :: Map Period PhyloPeriod
434 , _phylo_quality :: Double
435 , _phylo_level :: Double
436 }
437 deriving (Generic, Show, Eq)
438
439 instance ToSchema Phylo where
440 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
441
442
443 ----------------
444 -- | Period | --
445 ----------------
446
447 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
448 -- id: tuple (start date, end date) of the temporal step of the phylomemy
449 -- scales: scales of synchronic description
450 data PhyloPeriod =
451 PhyloPeriod { _phylo_periodPeriod :: Period
452 , _phylo_periodPeriodStr :: PeriodStr
453 , _phylo_periodScales :: Map PhyloScaleId PhyloScale
454 } deriving (Generic, Show, Eq)
455
456 instance ToSchema PhyloPeriod where
457 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
458
459 ---------------
460 -- | Scale | --
461 ---------------
462
463 -- | Scale : a scale of synchronic description
464 type Scale = Int
465
466 -- | PhyloScaleId : the id of a scale of synchronic description
467 type PhyloScaleId = (Period,Scale)
468
469 -- | PhyloScale : sub-structure of the phylomemy in scale of synchronic description
470 data PhyloScale =
471 PhyloScale { _phylo_scalePeriod :: Period
472 , _phylo_scalePeriodStr :: PeriodStr
473 , _phylo_scaleScale :: Scale
474 , _phylo_scaleGroups :: Map PhyloGroupId PhyloGroup
475 }
476 deriving (Generic, Show, Eq)
477
478 instance ToSchema PhyloScale where
479 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
480
481
482 type PhyloGroupId = (PhyloScaleId, Int)
483
484 -- | BranchId : (a scale, a sequence of branch index)
485 -- the sequence is a path of heritage from the most to the less specific branch
486 type PhyloBranchId = (Scale, [Int])
487
488 -- | PhyloGroup : group of ngrams at each scale and period
489 data PhyloGroup =
490 PhyloGroup { _phylo_groupPeriod :: Period
491 , _phylo_groupPeriod' :: (Text,Text)
492 , _phylo_groupScale :: Scale
493 , _phylo_groupIndex :: Int
494 , _phylo_groupLabel :: Text
495 , _phylo_groupSupport :: Support
496 , _phylo_groupWeight :: Maybe Double
497 , _phylo_groupSources :: [Int]
498 , _phylo_groupNgrams :: [Int]
499 , _phylo_groupCooc :: !(Cooc)
500 , _phylo_groupDensity :: Double
501 , _phylo_groupBranchId :: PhyloBranchId
502 , _phylo_groupMeta :: Map Text [Double]
503 , _phylo_groupRootsCount :: Map Int Double
504 , _phylo_groupScaleParents :: [Pointer]
505 , _phylo_groupScaleChilds :: [Pointer]
506 , _phylo_groupPeriodParents :: [Pointer]
507 , _phylo_groupPeriodChilds :: [Pointer]
508 , _phylo_groupAncestors :: [Pointer]
509 , _phylo_groupPeriodMemoryParents :: [Pointer']
510 , _phylo_groupPeriodMemoryChilds :: [Pointer']
511 }
512 deriving (Generic, Show, Eq, NFData, Ord)
513
514 instance ToSchema PhyloGroup where
515 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
516
517
518 -- | Weight : A generic mesure that can be associated with an Id
519 type Weight = Double
520 type Thr = Double
521
522 -- | Pointer : A weighted pointer to a given PhyloGroup
523 type Pointer = (PhyloGroupId, Weight)
524 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
525 type Pointer' = (PhyloGroupId, (Thr,Weight))
526
527 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
528 data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
529
530
531 --------------------------
532 -- | Phylo Clustering | --
533 --------------------------
534
535 -- | Support : Number of Documents where a Cluster occurs
536 type Support = Int
537
538 data Clustering = Clustering
539 { _clustering_roots :: [Int]
540 , _clustering_support :: Support
541 , _clustering_period :: Period
542 -- additional materials for visualization
543 , _clustering_visWeighting :: Maybe Double
544 , _clustering_visFiltering :: [Int]
545 } deriving (Generic,NFData,Show,Eq)
546
547 ----------------
548 -- | Export | --
549 ----------------
550
551 type DotId = TextLazy.Text
552
553 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
554
555 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
556 instance ToSchema Filter where
557 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
558
559
560 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
561
562 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
563 instance ToSchema Sort where
564 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
565
566
567 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
568 instance ToSchema Tagger where
569 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
570
571
572 data PhyloLabel =
573 BranchLabel
574 { _branch_labelTagger :: Tagger
575 , _branch_labelSize :: Int }
576 | GroupLabel
577 { _group_labelTagger :: Tagger
578 , _group_labelSize :: Int }
579 deriving (Show,Generic,Eq)
580
581 instance ToSchema PhyloLabel where
582 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
583
584
585 data PhyloBranch =
586 PhyloBranch
587 { _branch_id :: PhyloBranchId
588 , _branch_canonId :: [Int]
589 , _branch_seaLevel :: [Double]
590 , _branch_x :: Double
591 , _branch_y :: Double
592 , _branch_w :: Double
593 , _branch_t :: Double
594 , _branch_label :: Text
595 , _branch_meta :: Map Text [Double]
596 } deriving (Generic, Show, Eq)
597
598 instance ToSchema PhyloBranch where
599 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
600
601 data PhyloExport =
602 PhyloExport
603 { _export_groups :: [PhyloGroup]
604 , _export_branches :: [PhyloBranch]
605 } deriving (Generic, Show)
606 instance ToSchema PhyloExport where
607 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
608
609
610 ----------------
611 -- | Lenses | --
612 ----------------
613
614 makeLenses ''PhyloConfig
615 makeLenses ''PhyloSubConfigAPI
616 makeLenses ''PhyloSimilarity
617 makeLenses ''SeaElevation
618 makeLenses ''Quality
619 makeLenses ''Cluster
620 makeLenses ''PhyloLabel
621 makeLenses ''TimeUnit
622 makeLenses ''PhyloFoundations
623 makeLenses ''Clustering
624 makeLenses ''Phylo
625 makeLenses ''PhyloPeriod
626 makeLenses ''PhyloScale
627 makeLenses ''PhyloGroup
628 makeLenses ''PhyloParam
629 makeLenses ''PhyloExport
630 makeLenses ''PhyloBranch
631
632 ------------------------
633 -- | JSON instances | --
634 ------------------------
635
636 instance FromJSON Phylo
637 instance ToJSON Phylo
638
639 instance FromJSON PhyloSources
640 instance ToJSON PhyloSources
641
642 instance FromJSON PhyloParam
643 instance ToJSON PhyloParam
644
645 instance FromJSON PhyloCounts
646 instance ToJSON PhyloCounts
647
648 instance FromJSON PhyloPeriod
649 instance ToJSON PhyloPeriod
650
651 instance FromJSON PhyloScale
652 instance ToJSON PhyloScale
653
654 instance FromJSON Software
655 instance ToJSON Software
656
657 instance FromJSON PhyloGroup
658 instance ToJSON PhyloGroup
659
660 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)