]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
Merge branch 'dev' into 571-dev-node-corpus-api-search-fixes-take-2
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Swagger
35 import Data.Text (Text, pack)
36 import Data.Vector (Vector)
37 import GHC.Generics
38 import GHC.IO (FilePath)
39 import Gargantext.Core.Utils.Prefix (unPrefix)
40 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
41 import Gargantext.Prelude
42 import qualified Data.Text.Lazy as TextLazy
43
44 ----------------
45 -- | PhyloConfig | --
46 ----------------
47
48 data CorpusParser =
49 Wos {_wos_limit :: Int}
50 | Csv {_csv_limit :: Int}
51 | Csv' {_csv'_limit :: Int}
52 deriving (Show,Generic,Eq)
53
54 instance ToSchema CorpusParser where
55 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
56
57
58 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
59 instance ToSchema ListParser
60
61
62 data SeaElevation =
63 Constante
64 { _cons_start :: Double
65 , _cons_gap :: Double }
66 | Adaptative
67 { _adap_steps :: Double }
68 | Evolving
69 { _evol_neighborhood :: Bool }
70 deriving (Show,Generic,Eq)
71
72 instance ToSchema SeaElevation
73
74 data PhyloSimilarity =
75 WeightedLogJaccard
76 { _wlj_sensibility :: Double
77 , _wlj_minSharedNgrams :: Int }
78 | WeightedLogSim
79 { _wls_sensibility :: Double
80 , _wls_minSharedNgrams :: Int }
81 | Hamming
82 { _hmg_sensibility :: Double
83 , _hmg_minSharedNgrams :: Int}
84
85 deriving (Show,Generic,Eq)
86
87 instance ToSchema PhyloSimilarity where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
89
90
91 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
92 deriving (Show,Generic,Eq, ToSchema)
93
94 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
95 deriving (Show,Generic,Eq)
96
97 instance ToSchema SynchronyStrategy where
98 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
99
100
101 data Synchrony =
102 ByProximityThreshold
103 { _bpt_threshold :: Double
104 , _bpt_sensibility :: Double
105 , _bpt_scope :: SynchronyScope
106 , _bpt_strategy :: SynchronyStrategy }
107 | ByProximityDistribution
108 { _bpd_sensibility :: Double
109 , _bpd_strategy :: SynchronyStrategy }
110 deriving (Show,Generic,Eq)
111
112 instance ToSchema Synchrony where
113 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
114
115
116
117 data TimeUnit =
118 Epoch
119 { _epoch_period :: Int
120 , _epoch_step :: Int
121 , _epoch_matchingFrame :: Int }
122 | Year
123 { _year_period :: Int
124 , _year_step :: Int
125 , _year_matchingFrame :: Int }
126 | Month
127 { _month_period :: Int
128 , _month_step :: Int
129 , _month_matchingFrame :: Int }
130 | Week
131 { _week_period :: Int
132 , _week_step :: Int
133 , _week_matchingFrame :: Int }
134 | Day
135 { _day_period :: Int
136 , _day_step :: Int
137 , _day_matchingFrame :: Int }
138 deriving (Show,Generic,Eq,NFData)
139
140 instance ToSchema TimeUnit where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
142
143
144 data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
145
146 instance ToSchema MaxCliqueFilter where
147 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
148
149
150
151 data Cluster =
152 Fis
153 { _fis_support :: Int
154 , _fis_size :: Int }
155 | MaxClique
156 { _mcl_size :: Int
157 , _mcl_threshold :: Double
158 , _mcl_filter :: MaxCliqueFilter }
159 deriving (Show,Generic,Eq)
160
161 instance ToSchema Cluster where
162 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
163
164
165 data Quality =
166 Quality { _qua_granularity :: Double
167 , _qua_minBranch :: Int }
168 deriving (Show,Generic,Eq)
169
170 instance ToSchema Quality where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
172
173
174 data PhyloConfig =
175 PhyloConfig { corpusPath :: FilePath
176 , listPath :: FilePath
177 , outputPath :: FilePath
178 , corpusParser :: CorpusParser
179 , listParser :: ListParser
180 , phyloName :: Text
181 , phyloScale :: Int
182 , similarity :: PhyloSimilarity
183 , seaElevation :: SeaElevation
184 , defaultMode :: Bool
185 , findAncestors :: Bool
186 , phyloSynchrony :: Synchrony
187 , phyloQuality :: Quality
188 , timeUnit :: TimeUnit
189 , clique :: Cluster
190 , exportLabel :: [PhyloLabel]
191 , exportSort :: Sort
192 , exportFilter :: [Filter]
193 } deriving (Show,Generic,Eq)
194
195
196 ------------------------------------------------------------------------
197 data PhyloSubConfig =
198 PhyloSubConfig { _sc_phyloProximity :: Double
199 , _sc_phyloSynchrony :: Double
200 , _sc_phyloQuality :: Double
201 , _sc_timeUnit :: TimeUnit
202 , _sc_clique :: Cluster
203 , _sc_exportFilter :: Double
204 , _sc_defaultMode :: Bool
205 }
206 deriving (Show,Generic,Eq)
207
208
209 subConfig2config :: PhyloSubConfig -> PhyloConfig
210 subConfig2config subConfig = defaultConfig { similarity = WeightedLogJaccard (_sc_phyloProximity subConfig) 1
211 , phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
212 , phyloQuality = Quality (_sc_phyloQuality subConfig) 1
213 , timeUnit = _sc_timeUnit subConfig
214 , clique = _sc_clique subConfig
215 , defaultMode = _sc_defaultMode subConfig
216 , exportFilter = [ByBranchSize $ _sc_exportFilter subConfig]
217 }
218
219 ------------------------------------------------------------------------
220 defaultConfig :: PhyloConfig
221 defaultConfig =
222 PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
223 , listPath = "list.csv" -- useful for commandline only
224 , outputPath = "data/"
225 , corpusParser = Csv 100000
226 , listParser = V4
227 , phyloName = pack "Phylo Name"
228 , phyloScale = 2
229 , similarity = WeightedLogJaccard 0.5 1
230 , seaElevation = Constante 0.1 0.1
231 , defaultMode = False
232 , findAncestors = False
233 , phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
234 , phyloQuality = Quality 0.3 1
235 , timeUnit = Year 3 1 5
236 , clique = Fis 3 1 -- MaxClique 5 0.0001 ByThreshold
237 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
238 , exportSort = ByHierarchy Desc
239 , exportFilter = [ByBranchSize 3]
240 }
241
242 -- Main Instances
243 instance ToSchema PhyloConfig
244 instance ToSchema PhyloSubConfig
245
246 instance FromJSON PhyloConfig
247 instance ToJSON PhyloConfig
248
249 instance FromJSON PhyloSubConfig
250 instance ToJSON PhyloSubConfig
251
252 instance FromJSON CorpusParser
253 instance ToJSON CorpusParser
254
255 instance FromJSON ListParser
256 instance ToJSON ListParser
257
258 instance FromJSON PhyloSimilarity
259 instance ToJSON PhyloSimilarity
260
261 instance FromJSON SeaElevation
262 instance ToJSON SeaElevation
263
264 instance FromJSON TimeUnit
265 instance ToJSON TimeUnit
266
267 instance FromJSON MaxCliqueFilter
268 instance ToJSON MaxCliqueFilter
269
270 instance FromJSON Cluster
271 instance ToJSON Cluster
272
273 instance FromJSON PhyloLabel
274 instance ToJSON PhyloLabel
275
276 instance FromJSON Tagger
277 instance ToJSON Tagger
278
279 instance FromJSON Sort
280 instance ToJSON Sort
281
282 instance FromJSON Order
283 instance ToJSON Order
284
285 instance FromJSON Filter
286 instance ToJSON Filter
287
288 instance FromJSON SynchronyScope
289 instance ToJSON SynchronyScope
290
291 instance FromJSON SynchronyStrategy
292 instance ToJSON SynchronyStrategy
293
294 instance FromJSON Synchrony
295 instance ToJSON Synchrony
296
297 instance FromJSON Quality
298 instance ToJSON Quality
299
300
301 -- | Software parameters
302 data Software =
303 Software { _software_name :: Text
304 , _software_version :: Text
305 } deriving (Generic, Show, Eq)
306
307 instance ToSchema Software where
308 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
309
310
311
312 defaultSoftware :: Software
313 defaultSoftware =
314 Software { _software_name = pack "GarganText"
315 , _software_version = pack "v5" }
316
317
318 -- | Global parameters of a Phylo
319 data PhyloParam =
320 PhyloParam { _phyloParam_version :: Text
321 , _phyloParam_software :: Software
322 , _phyloParam_config :: PhyloConfig
323 } deriving (Generic, Show, Eq)
324
325 instance ToSchema PhyloParam where
326 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
327
328
329
330 defaultPhyloParam :: PhyloParam
331 defaultPhyloParam =
332 PhyloParam { _phyloParam_version = pack "v3"
333 , _phyloParam_software = defaultSoftware
334 , _phyloParam_config = defaultConfig }
335
336
337 ------------------
338 -- | Document | --
339 ------------------
340
341 -- | Date : a simple Integer
342 type Date = Int
343
344 -- | DateStr : the string version of a Date
345 type DateStr = Text
346
347 -- | Ngrams : a contiguous sequence of n terms
348 type Ngrams = Text
349
350 -- Document : a piece of Text linked to a Date
351 -- date = computational date; date' = original string date yyyy-mm-dd
352 -- Export Database to Document
353 data Document = Document
354 { date :: Date -- datatype Date {unDate :: Int}
355 , date' :: DateStr -- show date
356 , text :: [Ngrams]
357 , weight :: Maybe Double
358 , sources :: [Text]
359 , docTime :: TimeUnit
360 } deriving (Eq,Show,Generic,NFData)
361
362
363 --------------------
364 -- | Foundation | --
365 --------------------
366
367
368 -- | The Foundations of a Phylo created from a given TermList
369 data PhyloFoundations = PhyloFoundations
370 { _foundations_roots :: (Vector Ngrams)
371 , _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
372 } deriving (Generic, Show, Eq)
373
374 data PhyloCounts = PhyloCounts
375 { coocByDate :: !(Map Date Cooc)
376 , docsByDate :: !(Map Date Double)
377 , rootsCountByDate :: !(Map Date (Map Int Double))
378 , rootsCount :: !(Map Int Double)
379 , rootsFreq :: !(Map Int Double)
380 , lastRootsFreq :: !(Map Int Double)
381 } deriving (Generic, Show, Eq)
382
383 data PhyloSources = PhyloSources
384 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
385
386 instance ToSchema PhyloFoundations where
387 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
388 instance ToSchema PhyloCounts where
389 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
390 instance ToSchema PhyloSources where
391 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
392
393 ---------------------------
394 -- | Coocurency Matrix | --
395 ---------------------------
396
397
398 -- | Cooc : a coocurency matrix between two ngrams
399 type Cooc = Map (Int,Int) Double
400
401
402 -------------------
403 -- | Phylomemy | --
404 -------------------
405
406 -- | Period : a tuple of Dates
407 type Period = (Date,Date)
408
409 -- | PeriodStr : a tuple of DateStr
410 type PeriodStr = (DateStr,DateStr)
411
412
413
414
415 -- | Phylo datatype of a phylomemy
416 -- foundations : the foundations of the phylo
417 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
418 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
419 -- param : the parameters of the phylomemy (with the user's configuration)
420 -- periods : the temporal steps of a phylomemy
421 data Phylo =
422 Phylo { _phylo_foundations :: PhyloFoundations
423 , _phylo_sources :: PhyloSources
424 , _phylo_counts :: PhyloCounts
425 , _phylo_seaLadder :: [Double]
426 , _phylo_param :: PhyloParam
427 , _phylo_periods :: Map Period PhyloPeriod
428 , _phylo_quality :: Double
429 , _phylo_level :: Double
430 }
431 deriving (Generic, Show, Eq)
432
433 instance ToSchema Phylo where
434 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
435
436 ----------------
437 -- | Period | --
438 ----------------
439
440 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
441 -- id: tuple (start date, end date) of the temporal step of the phylomemy
442 -- scales: scales of synchronic description
443 data PhyloPeriod =
444 PhyloPeriod { _phylo_periodPeriod :: Period
445 , _phylo_periodPeriodStr :: PeriodStr
446 , _phylo_periodScales :: Map PhyloScaleId PhyloScale
447 } deriving (Generic, Show, Eq)
448
449 instance ToSchema PhyloPeriod where
450 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
451
452 ---------------
453 -- | Scale | --
454 ---------------
455
456 -- | Scale : a scale of synchronic description
457 type Scale = Int
458
459 -- | PhyloScaleId : the id of a scale of synchronic description
460 type PhyloScaleId = (Period,Scale)
461
462 -- | PhyloScale : sub-structure of the phylomemy in scale of synchronic description
463 data PhyloScale =
464 PhyloScale { _phylo_scalePeriod :: Period
465 , _phylo_scalePeriodStr :: PeriodStr
466 , _phylo_scaleScale :: Scale
467 , _phylo_scaleGroups :: Map PhyloGroupId PhyloGroup
468 }
469 deriving (Generic, Show, Eq)
470
471 instance ToSchema PhyloScale where
472 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
473
474
475 type PhyloGroupId = (PhyloScaleId, Int)
476
477 -- | BranchId : (a scale, a sequence of branch index)
478 -- the sequence is a path of heritage from the most to the less specific branch
479 type PhyloBranchId = (Scale, [Int])
480
481 -- | PhyloGroup : group of ngrams at each scale and period
482 data PhyloGroup =
483 PhyloGroup { _phylo_groupPeriod :: Period
484 , _phylo_groupPeriod' :: (Text,Text)
485 , _phylo_groupScale :: Scale
486 , _phylo_groupIndex :: Int
487 , _phylo_groupLabel :: Text
488 , _phylo_groupSupport :: Support
489 , _phylo_groupWeight :: Maybe Double
490 , _phylo_groupSources :: [Int]
491 , _phylo_groupNgrams :: [Int]
492 , _phylo_groupCooc :: !(Cooc)
493 , _phylo_groupDensity :: Double
494 , _phylo_groupBranchId :: PhyloBranchId
495 , _phylo_groupMeta :: Map Text [Double]
496 , _phylo_groupRootsCount :: Map Int Double
497 , _phylo_groupScaleParents :: [Pointer]
498 , _phylo_groupScaleChilds :: [Pointer]
499 , _phylo_groupPeriodParents :: [Pointer]
500 , _phylo_groupPeriodChilds :: [Pointer]
501 , _phylo_groupAncestors :: [Pointer]
502 , _phylo_groupPeriodMemoryParents :: [Pointer']
503 , _phylo_groupPeriodMemoryChilds :: [Pointer']
504 }
505 deriving (Generic, Show, Eq, NFData)
506
507 instance ToSchema PhyloGroup where
508 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
509
510
511 -- | Weight : A generic mesure that can be associated with an Id
512 type Weight = Double
513 type Thr = Double
514
515 -- | Pointer : A weighted pointer to a given PhyloGroup
516 type Pointer = (PhyloGroupId, Weight)
517 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
518 type Pointer' = (PhyloGroupId, (Thr,Weight))
519
520 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
521 data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
522
523
524 --------------------------
525 -- | Phylo Clustering | --
526 --------------------------
527
528 -- | Support : Number of Documents where a Cluster occurs
529 type Support = Int
530
531 data Clustering = Clustering
532 { _clustering_roots :: [Int]
533 , _clustering_support :: Support
534 , _clustering_period :: Period
535 -- additional materials for visualization
536 , _clustering_visWeighting :: Maybe Double
537 , _clustering_visFiltering :: [Int]
538 } deriving (Generic,NFData,Show,Eq)
539
540 ----------------
541 -- | Export | --
542 ----------------
543
544 type DotId = TextLazy.Text
545
546 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
547
548 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
549 instance ToSchema Filter where
550 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
551
552
553 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
554
555 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
556 instance ToSchema Sort where
557 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
558
559
560 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
561 instance ToSchema Tagger where
562 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
563
564
565 data PhyloLabel =
566 BranchLabel
567 { _branch_labelTagger :: Tagger
568 , _branch_labelSize :: Int }
569 | GroupLabel
570 { _group_labelTagger :: Tagger
571 , _group_labelSize :: Int }
572 deriving (Show,Generic,Eq)
573
574 instance ToSchema PhyloLabel where
575 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
576
577
578 data PhyloBranch =
579 PhyloBranch
580 { _branch_id :: PhyloBranchId
581 , _branch_canonId :: [Int]
582 , _branch_seaLevel :: [Double]
583 , _branch_x :: Double
584 , _branch_y :: Double
585 , _branch_w :: Double
586 , _branch_t :: Double
587 , _branch_label :: Text
588 , _branch_meta :: Map Text [Double]
589 } deriving (Generic, Show, Eq)
590
591 instance ToSchema PhyloBranch where
592 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
593
594 data PhyloExport =
595 PhyloExport
596 { _export_groups :: [PhyloGroup]
597 , _export_branches :: [PhyloBranch]
598 } deriving (Generic, Show)
599 instance ToSchema PhyloExport where
600 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
601
602
603 ----------------
604 -- | Lenses | --
605 ----------------
606
607 makeLenses ''PhyloConfig
608 makeLenses ''PhyloSubConfig
609 makeLenses ''PhyloSimilarity
610 makeLenses ''SeaElevation
611 makeLenses ''Quality
612 makeLenses ''Cluster
613 makeLenses ''PhyloLabel
614 makeLenses ''TimeUnit
615 makeLenses ''PhyloFoundations
616 makeLenses ''Clustering
617 makeLenses ''Phylo
618 makeLenses ''PhyloPeriod
619 makeLenses ''PhyloScale
620 makeLenses ''PhyloGroup
621 makeLenses ''PhyloParam
622 makeLenses ''PhyloExport
623 makeLenses ''PhyloBranch
624
625 ------------------------
626 -- | JSON instances | --
627 ------------------------
628
629 instance FromJSON Phylo
630 instance ToJSON Phylo
631
632 instance FromJSON PhyloSources
633 instance ToJSON PhyloSources
634
635 instance FromJSON PhyloParam
636 instance ToJSON PhyloParam
637
638 instance FromJSON PhyloCounts
639 instance ToJSON PhyloCounts
640
641 instance FromJSON PhyloPeriod
642 instance ToJSON PhyloPeriod
643
644 instance FromJSON PhyloScale
645 instance ToJSON PhyloScale
646
647 instance FromJSON Software
648 instance ToJSON Software
649
650 instance FromJSON PhyloGroup
651 instance ToJSON PhyloGroup
652
653 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)