]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
doc debug
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Swagger
35 import Data.Text (Text, pack)
36 import Data.Vector (Vector)
37 import GHC.Generics
38 import GHC.IO (FilePath)
39 import Gargantext.Core.Utils.Prefix (unPrefix)
40 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
41 import Gargantext.Prelude
42 import qualified Data.Text.Lazy as TextLazy
43
44 ----------------
45 -- | PhyloConfig | --
46 ----------------
47
48 data CorpusParser =
49 Wos {_wos_limit :: Int}
50 | Csv {_csv_limit :: Int}
51 | Csv' {_csv'_limit :: Int}
52 deriving (Show,Generic,Eq)
53
54 instance ToSchema CorpusParser where
55 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
56
57
58 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
59 instance ToSchema ListParser
60
61
62 data SeaElevation =
63 Constante
64 { _cons_start :: Double
65 , _cons_gap :: Double }
66 | Adaptative
67 { _adap_steps :: Double }
68 | Evolving
69 { _evol_neighborhood :: Bool }
70 deriving (Show,Generic,Eq)
71
72 instance ToSchema SeaElevation
73
74 data PhyloSimilarity =
75 WeightedLogJaccard
76 { _wlj_sensibility :: Double
77 , _wlj_minSharedNgrams :: Int }
78 | WeightedLogSim
79 { _wls_sensibility :: Double
80 , _wls_minSharedNgrams :: Int }
81 | Hamming
82 { _hmg_sensibility :: Double
83 , _hmg_minSharedNgrams :: Int}
84
85 deriving (Show,Generic,Eq)
86
87 instance ToSchema PhyloSimilarity where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
89
90
91 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
92 deriving (Show,Generic,Eq, ToSchema)
93
94 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
95 deriving (Show,Generic,Eq)
96
97 instance ToSchema SynchronyStrategy where
98 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
99
100
101 data Synchrony =
102 ByProximityThreshold
103 { _bpt_threshold :: Double
104 , _bpt_sensibility :: Double
105 , _bpt_scope :: SynchronyScope
106 , _bpt_strategy :: SynchronyStrategy }
107 | ByProximityDistribution
108 { _bpd_sensibility :: Double
109 , _bpd_strategy :: SynchronyStrategy }
110 deriving (Show,Generic,Eq)
111
112 instance ToSchema Synchrony where
113 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
114
115
116
117 data TimeUnit =
118 Epoch
119 { _epoch_period :: Int
120 , _epoch_step :: Int
121 , _epoch_matchingFrame :: Int }
122 | Year
123 { _year_period :: Int
124 , _year_step :: Int
125 , _year_matchingFrame :: Int }
126 | Month
127 { _month_period :: Int
128 , _month_step :: Int
129 , _month_matchingFrame :: Int }
130 | Week
131 { _week_period :: Int
132 , _week_step :: Int
133 , _week_matchingFrame :: Int }
134 | Day
135 { _day_period :: Int
136 , _day_step :: Int
137 , _day_matchingFrame :: Int }
138 deriving (Show,Generic,Eq)
139
140 instance ToSchema TimeUnit where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
142
143
144 data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
145
146 instance ToSchema MaxCliqueFilter where
147 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
148
149
150
151 data Cluster =
152 Fis
153 { _fis_support :: Int
154 , _fis_size :: Int }
155 | MaxClique
156 { _mcl_size :: Int
157 , _mcl_threshold :: Double
158 , _mcl_filter :: MaxCliqueFilter }
159 deriving (Show,Generic,Eq)
160
161 instance ToSchema Cluster where
162 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
163
164
165 data Quality =
166 Quality { _qua_granularity :: Double
167 , _qua_minBranch :: Int }
168 deriving (Show,Generic,Eq)
169
170 instance ToSchema Quality where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
172
173
174 data PhyloConfig =
175 PhyloConfig { corpusPath :: FilePath
176 , listPath :: FilePath
177 , outputPath :: FilePath
178 , corpusParser :: CorpusParser
179 , listParser :: ListParser
180 , phyloName :: Text
181 , phyloScale :: Int
182 , similarity :: PhyloSimilarity
183 , seaElevation :: SeaElevation
184 , defaultMode :: Bool
185 , findAncestors :: Bool
186 , phyloSynchrony :: Synchrony
187 , phyloQuality :: Quality
188 , timeUnit :: TimeUnit
189 , clique :: Cluster
190 , exportLabel :: [PhyloLabel]
191 , exportSort :: Sort
192 , exportFilter :: [Filter]
193 } deriving (Show,Generic,Eq)
194
195
196 ------------------------------------------------------------------------
197 data PhyloSubConfig =
198 PhyloSubConfig { _sc_phyloProximity :: Double
199 , _sc_phyloSynchrony :: Double
200 , _sc_phyloQuality :: Double
201 , _sc_timeUnit :: TimeUnit
202 , _sc_clique :: Cluster
203 , _sc_exportFilter :: Double
204 , _sc_defaultMode :: Bool
205 }
206 deriving (Show,Generic,Eq)
207
208
209 subConfig2config :: PhyloSubConfig -> PhyloConfig
210 subConfig2config subConfig = defaultConfig { similarity = WeightedLogJaccard (_sc_phyloProximity subConfig) 1
211 , phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
212 , phyloQuality = Quality (_sc_phyloQuality subConfig) 1
213 , timeUnit = _sc_timeUnit subConfig
214 , clique = _sc_clique subConfig
215 , exportFilter = [ByBranchSize $ _sc_exportFilter subConfig]
216 }
217
218 ------------------------------------------------------------------------
219 defaultConfig :: PhyloConfig
220 defaultConfig =
221 PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
222 , listPath = "list.csv" -- useful for commandline only
223 , outputPath = "data/"
224 , corpusParser = Csv 100000
225 , listParser = V4
226 , phyloName = pack "Phylo Name"
227 , phyloScale = 2
228 , similarity = WeightedLogJaccard 0.5 1
229 , seaElevation = Constante 0.1 0.1
230 , defaultMode = True
231 , findAncestors = False
232 , phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
233 , phyloQuality = Quality 0.5 1
234 , timeUnit = Year 3 1 5
235 , clique = MaxClique 5 0.0001 ByThreshold
236 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
237 , exportSort = ByHierarchy Desc
238 , exportFilter = [ByBranchSize 3]
239 }
240
241 -- Main Instances
242 instance ToSchema PhyloConfig
243 instance ToSchema PhyloSubConfig
244
245 instance FromJSON PhyloConfig
246 instance ToJSON PhyloConfig
247
248 instance FromJSON PhyloSubConfig
249 instance ToJSON PhyloSubConfig
250
251 instance FromJSON CorpusParser
252 instance ToJSON CorpusParser
253
254 instance FromJSON ListParser
255 instance ToJSON ListParser
256
257 instance FromJSON PhyloSimilarity
258 instance ToJSON PhyloSimilarity
259
260 instance FromJSON SeaElevation
261 instance ToJSON SeaElevation
262
263 instance FromJSON TimeUnit
264 instance ToJSON TimeUnit
265
266 instance FromJSON MaxCliqueFilter
267 instance ToJSON MaxCliqueFilter
268
269 instance FromJSON Cluster
270 instance ToJSON Cluster
271
272 instance FromJSON PhyloLabel
273 instance ToJSON PhyloLabel
274
275 instance FromJSON Tagger
276 instance ToJSON Tagger
277
278 instance FromJSON Sort
279 instance ToJSON Sort
280
281 instance FromJSON Order
282 instance ToJSON Order
283
284 instance FromJSON Filter
285 instance ToJSON Filter
286
287 instance FromJSON SynchronyScope
288 instance ToJSON SynchronyScope
289
290 instance FromJSON SynchronyStrategy
291 instance ToJSON SynchronyStrategy
292
293 instance FromJSON Synchrony
294 instance ToJSON Synchrony
295
296 instance FromJSON Quality
297 instance ToJSON Quality
298
299
300 -- | Software parameters
301 data Software =
302 Software { _software_name :: Text
303 , _software_version :: Text
304 } deriving (Generic, Show, Eq)
305
306 instance ToSchema Software where
307 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
308
309
310
311 defaultSoftware :: Software
312 defaultSoftware =
313 Software { _software_name = pack "GarganText"
314 , _software_version = pack "v5" }
315
316
317 -- | Global parameters of a Phylo
318 data PhyloParam =
319 PhyloParam { _phyloParam_version :: Text
320 , _phyloParam_software :: Software
321 , _phyloParam_config :: PhyloConfig
322 } deriving (Generic, Show, Eq)
323
324 instance ToSchema PhyloParam where
325 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
326
327
328
329 defaultPhyloParam :: PhyloParam
330 defaultPhyloParam =
331 PhyloParam { _phyloParam_version = pack "v3"
332 , _phyloParam_software = defaultSoftware
333 , _phyloParam_config = defaultConfig }
334
335
336 ------------------
337 -- | Document | --
338 ------------------
339
340 -- | Date : a simple Integer
341 type Date = Int
342
343 -- | DateStr : the string version of a Date
344 type DateStr = Text
345
346 -- | Ngrams : a contiguous sequence of n terms
347 type Ngrams = Text
348
349 -- Document : a piece of Text linked to a Date
350 -- date = computational date; date' = original string date yyyy-mm-dd
351 -- Export Database to Document
352 data Document = Document
353 { date :: Date -- datatype Date {unDate :: Int}
354 , date' :: DateStr -- show date
355 , text :: [Ngrams]
356 , weight :: Maybe Double
357 , sources :: [Text]
358 } deriving (Eq,Show,Generic,NFData)
359
360
361 --------------------
362 -- | Foundation | --
363 --------------------
364
365
366 -- | The Foundations of a Phylo created from a given TermList
367 data PhyloFoundations = PhyloFoundations
368 { _foundations_roots :: (Vector Ngrams)
369 , _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
370 } deriving (Generic, Show, Eq)
371
372 data PhyloCounts = PhyloCounts
373 { coocByDate :: !(Map Date Cooc)
374 , docsByDate :: !(Map Date Double)
375 , rootsCount :: !(Map Int Double)
376 , rootsFreq :: !(Map Int Double)
377 , lastRootsFreq :: !(Map Int Double)
378 } deriving (Generic, Show, Eq)
379
380 data PhyloSources = PhyloSources
381 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
382
383 instance ToSchema PhyloFoundations where
384 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
385 instance ToSchema PhyloCounts where
386 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
387 instance ToSchema PhyloSources where
388 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
389
390 ---------------------------
391 -- | Coocurency Matrix | --
392 ---------------------------
393
394
395 -- | Cooc : a coocurency matrix between two ngrams
396 type Cooc = Map (Int,Int) Double
397
398
399 -------------------
400 -- | Phylomemy | --
401 -------------------
402
403 -- | Period : a tuple of Dates
404 type Period = (Date,Date)
405
406 -- | PeriodStr : a tuple of DateStr
407 type PeriodStr = (DateStr,DateStr)
408
409
410
411
412 -- | Phylo datatype of a phylomemy
413 -- foundations : the foundations of the phylo
414 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
415 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
416 -- param : the parameters of the phylomemy (with the user's configuration)
417 -- periods : the temporal steps of a phylomemy
418 data Phylo =
419 Phylo { _phylo_foundations :: PhyloFoundations
420 , _phylo_sources :: PhyloSources
421 , _phylo_counts :: PhyloCounts
422 , _phylo_seaLadder :: [Double]
423 , _phylo_param :: PhyloParam
424 , _phylo_periods :: Map Period PhyloPeriod
425 , _phylo_quality :: Double
426 , _phylo_level :: Double
427 }
428 deriving (Generic, Show, Eq)
429
430 instance ToSchema Phylo where
431 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
432
433
434 ----------------
435 -- | Period | --
436 ----------------
437
438 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
439 -- id: tuple (start date, end date) of the temporal step of the phylomemy
440 -- scales: scales of synchronic description
441 data PhyloPeriod =
442 PhyloPeriod { _phylo_periodPeriod :: Period
443 , _phylo_periodPeriodStr :: PeriodStr
444 , _phylo_periodScales :: Map PhyloScaleId PhyloScale
445 } deriving (Generic, Show, Eq)
446
447 instance ToSchema PhyloPeriod where
448 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
449
450 ---------------
451 -- | Scale | --
452 ---------------
453
454 -- | Scale : a scale of synchronic description
455 type Scale = Int
456
457 -- | PhyloScaleId : the id of a scale of synchronic description
458 type PhyloScaleId = (Period,Scale)
459
460 -- | PhyloScale : sub-structure of the phylomemy in scale of synchronic description
461 data PhyloScale =
462 PhyloScale { _phylo_scalePeriod :: Period
463 , _phylo_scalePeriodStr :: PeriodStr
464 , _phylo_scaleScale :: Scale
465 , _phylo_scaleGroups :: Map PhyloGroupId PhyloGroup
466 }
467 deriving (Generic, Show, Eq)
468
469 instance ToSchema PhyloScale where
470 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
471
472
473 type PhyloGroupId = (PhyloScaleId, Int)
474
475 -- | BranchId : (a scale, a sequence of branch index)
476 -- the sequence is a path of heritage from the most to the less specific branch
477 type PhyloBranchId = (Scale, [Int])
478
479 -- | PhyloGroup : group of ngrams at each scale and period
480 data PhyloGroup =
481 PhyloGroup { _phylo_groupPeriod :: Period
482 , _phylo_groupPeriod' :: (Text,Text)
483 , _phylo_groupScale :: Scale
484 , _phylo_groupIndex :: Int
485 , _phylo_groupLabel :: Text
486 , _phylo_groupSupport :: Support
487 , _phylo_groupWeight :: Maybe Double
488 , _phylo_groupSources :: [Int]
489 , _phylo_groupNgrams :: [Int]
490 , _phylo_groupCooc :: !(Cooc)
491 , _phylo_groupBranchId :: PhyloBranchId
492 , _phylo_groupMeta :: Map Text [Double]
493 , _phylo_groupScaleParents :: [Pointer]
494 , _phylo_groupScaleChilds :: [Pointer]
495 , _phylo_groupPeriodParents :: [Pointer]
496 , _phylo_groupPeriodChilds :: [Pointer]
497 , _phylo_groupAncestors :: [Pointer]
498 , _phylo_groupPeriodMemoryParents :: [Pointer']
499 , _phylo_groupPeriodMemoryChilds :: [Pointer']
500 }
501 deriving (Generic, Show, Eq, NFData)
502
503 instance ToSchema PhyloGroup where
504 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
505
506
507 -- | Weight : A generic mesure that can be associated with an Id
508 type Weight = Double
509 type Thr = Double
510
511 -- | Pointer : A weighted pointer to a given PhyloGroup
512 type Pointer = (PhyloGroupId, Weight)
513 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
514 type Pointer' = (PhyloGroupId, (Thr,Weight))
515
516 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
517 data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
518
519
520 --------------------------
521 -- | Phylo Clustering | --
522 --------------------------
523
524 -- | Support : Number of Documents where a Cluster occurs
525 type Support = Int
526
527 data Clustering = Clustering
528 { _clustering_roots :: [Int]
529 , _clustering_support :: Support
530 , _clustering_period :: Period
531 -- additional materials for visualization
532 , _clustering_visWeighting :: Maybe Double
533 , _clustering_visFiltering :: [Int]
534 } deriving (Generic,NFData,Show,Eq)
535
536 ----------------
537 -- | Export | --
538 ----------------
539
540 type DotId = TextLazy.Text
541
542 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
543
544 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
545 instance ToSchema Filter where
546 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
547
548
549 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
550
551 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
552 instance ToSchema Sort where
553 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
554
555
556 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
557 instance ToSchema Tagger where
558 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
559
560
561 data PhyloLabel =
562 BranchLabel
563 { _branch_labelTagger :: Tagger
564 , _branch_labelSize :: Int }
565 | GroupLabel
566 { _group_labelTagger :: Tagger
567 , _group_labelSize :: Int }
568 deriving (Show,Generic,Eq)
569
570 instance ToSchema PhyloLabel where
571 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
572
573
574 data PhyloBranch =
575 PhyloBranch
576 { _branch_id :: PhyloBranchId
577 , _branch_canonId :: [Int]
578 , _branch_seaLevel :: [Double]
579 , _branch_x :: Double
580 , _branch_y :: Double
581 , _branch_w :: Double
582 , _branch_t :: Double
583 , _branch_label :: Text
584 , _branch_meta :: Map Text [Double]
585 } deriving (Generic, Show, Eq)
586
587 instance ToSchema PhyloBranch where
588 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
589
590 data PhyloExport =
591 PhyloExport
592 { _export_groups :: [PhyloGroup]
593 , _export_branches :: [PhyloBranch]
594 } deriving (Generic, Show)
595 instance ToSchema PhyloExport where
596 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
597
598
599 ----------------
600 -- | Lenses | --
601 ----------------
602
603 makeLenses ''PhyloConfig
604 makeLenses ''PhyloSubConfig
605 makeLenses ''PhyloSimilarity
606 makeLenses ''SeaElevation
607 makeLenses ''Quality
608 makeLenses ''Cluster
609 makeLenses ''PhyloLabel
610 makeLenses ''TimeUnit
611 makeLenses ''PhyloFoundations
612 makeLenses ''Clustering
613 makeLenses ''Phylo
614 makeLenses ''PhyloPeriod
615 makeLenses ''PhyloScale
616 makeLenses ''PhyloGroup
617 makeLenses ''PhyloParam
618 makeLenses ''PhyloExport
619 makeLenses ''PhyloBranch
620
621 ------------------------
622 -- | JSON instances | --
623 ------------------------
624
625 instance FromJSON Phylo
626 instance ToJSON Phylo
627
628 instance FromJSON PhyloSources
629 instance ToJSON PhyloSources
630
631 instance FromJSON PhyloParam
632 instance ToJSON PhyloParam
633
634 instance FromJSON PhyloCounts
635 instance ToJSON PhyloCounts
636
637 instance FromJSON PhyloPeriod
638 instance ToJSON PhyloPeriod
639
640 instance FromJSON PhyloScale
641 instance ToJSON PhyloScale
642
643 instance FromJSON Software
644 instance ToJSON Software
645
646 instance FromJSON PhyloGroup
647 instance ToJSON PhyloGroup
648
649 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)