]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
maybe fix the phylo issue
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Swagger
35 import Data.Text (Text, pack)
36 import Data.Vector (Vector)
37 import GHC.Generics
38 import GHC.IO (FilePath)
39 import Gargantext.Core.Utils.Prefix (unPrefix)
40 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
41 import Gargantext.Prelude
42 import qualified Data.Text.Lazy as TextLazy
43
44 ---------------------
45 -- | PhyloConfig | --
46 ---------------------
47
48 data CorpusParser =
49 Wos {_wos_limit :: Int}
50 | Csv {_csv_limit :: Int}
51 | Csv' {_csv'_limit :: Int}
52 deriving (Show,Generic,Eq)
53
54 instance ToSchema CorpusParser where
55 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
56
57
58 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
59 instance ToSchema ListParser
60
61
62 data SeaElevation =
63 Constante
64 { _cons_start :: Double
65 , _cons_gap :: Double }
66 | Adaptative
67 { _adap_steps :: Double }
68 | Evolving
69 { _evol_neighborhood :: Bool }
70 deriving (Show,Generic,Eq)
71
72 instance ToSchema SeaElevation
73
74 data PhyloSimilarity =
75 WeightedLogJaccard
76 { _wlj_sensibility :: Double
77 , _wlj_minSharedNgrams :: Int }
78 | WeightedLogSim
79 { _wls_sensibility :: Double
80 , _wls_minSharedNgrams :: Int }
81 | Hamming
82 { _hmg_sensibility :: Double
83 , _hmg_minSharedNgrams :: Int}
84
85 deriving (Show,Generic,Eq)
86
87 instance ToSchema PhyloSimilarity where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
89
90
91 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
92 deriving (Show,Generic,Eq, ToSchema)
93
94 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
95 deriving (Show,Generic,Eq)
96
97 instance ToSchema SynchronyStrategy where
98 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
99
100
101 data Synchrony =
102 ByProximityThreshold
103 { _bpt_threshold :: Double
104 , _bpt_sensibility :: Double
105 , _bpt_scope :: SynchronyScope
106 , _bpt_strategy :: SynchronyStrategy }
107 | ByProximityDistribution
108 { _bpd_sensibility :: Double
109 , _bpd_strategy :: SynchronyStrategy }
110 deriving (Show,Generic,Eq)
111
112 instance ToSchema Synchrony where
113 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
114
115
116
117 data TimeUnit =
118 Epoch
119 { _epoch_period :: Int
120 , _epoch_step :: Int
121 , _epoch_matchingFrame :: Int }
122 | Year
123 { _year_period :: Int
124 , _year_step :: Int
125 , _year_matchingFrame :: Int }
126 | Month
127 { _month_period :: Int
128 , _month_step :: Int
129 , _month_matchingFrame :: Int }
130 | Week
131 { _week_period :: Int
132 , _week_step :: Int
133 , _week_matchingFrame :: Int }
134 | Day
135 { _day_period :: Int
136 , _day_step :: Int
137 , _day_matchingFrame :: Int }
138 deriving (Show,Generic,Eq,NFData)
139
140 instance ToSchema TimeUnit where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
142
143
144 data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
145
146 instance ToSchema MaxCliqueFilter where
147 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
148
149
150
151 data Cluster =
152 Fis
153 { _fis_support :: Int
154 , _fis_size :: Int }
155 | MaxClique
156 { _mcl_size :: Int
157 , _mcl_threshold :: Double
158 , _mcl_filter :: MaxCliqueFilter }
159 deriving (Show,Generic,Eq)
160
161 instance ToSchema Cluster where
162 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
163
164
165 data Quality =
166 Quality { _qua_granularity :: Double
167 , _qua_minBranch :: Int }
168 deriving (Show,Generic,Eq)
169
170 instance ToSchema Quality where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
172
173
174 data PhyloConfig =
175 PhyloConfig { corpusPath :: FilePath
176 , listPath :: FilePath
177 , outputPath :: FilePath
178 , corpusParser :: CorpusParser
179 , listParser :: ListParser
180 , phyloName :: Text
181 , phyloScale :: Int
182 , similarity :: PhyloSimilarity
183 , seaElevation :: SeaElevation
184 , defaultMode :: Bool
185 , findAncestors :: Bool
186 , phyloSynchrony :: Synchrony
187 , phyloQuality :: Quality
188 , timeUnit :: TimeUnit
189 , clique :: Cluster
190 , exportLabel :: [PhyloLabel]
191 , exportSort :: Sort
192 , exportFilter :: [Filter]
193 } deriving (Show,Generic,Eq)
194
195
196 --------------------------------
197 -- | SubConfig API & 1Click | --
198 --------------------------------
199
200 data PhyloSubConfigAPI =
201 PhyloSubConfigAPI { _sc_phyloProximity :: Double
202 , _sc_phyloSynchrony :: Double
203 , _sc_phyloQuality :: Double
204 , _sc_timeUnit :: TimeUnit
205 , _sc_clique :: Cluster
206 , _sc_exportFilter :: Double
207 } deriving (Show,Generic,Eq)
208
209
210 subConfigAPI2config :: PhyloSubConfigAPI -> PhyloConfig
211 subConfigAPI2config subConfig = defaultConfig
212 { similarity = WeightedLogJaccard (_sc_phyloProximity subConfig) 2
213 , phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
214 , phyloQuality = Quality (_sc_phyloQuality subConfig) 3
215 , timeUnit = _sc_timeUnit subConfig
216 , clique = _sc_clique subConfig
217 , exportFilter = [ByBranchSize $ _sc_exportFilter subConfig]
218 }
219
220 --------------------------
221 -- | SubConfig 1Click | --
222 --------------------------
223
224 defaultConfig :: PhyloConfig
225 defaultConfig =
226 PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
227 , listPath = "list.csv" -- useful for commandline only
228 , outputPath = "data/"
229 , corpusParser = Csv 150000
230 , listParser = V4
231 , phyloName = pack "Phylo Name"
232 , phyloScale = 2
233 , similarity = WeightedLogJaccard 0.5 2
234 , seaElevation = Constante 0.1 0.1
235 , defaultMode = False
236 , findAncestors = True
237 , phyloSynchrony = ByProximityThreshold 0.6 0 AllBranches MergeAllGroups
238 , phyloQuality = Quality 0.5 3
239 , timeUnit = Year 3 1 5
240 , clique = Fis 2 3
241 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
242 , exportSort = ByHierarchy Desc
243 , exportFilter = [ByBranchSize 3]
244 }
245
246 -- Main Instances
247 instance ToSchema PhyloConfig
248 instance ToSchema PhyloSubConfigAPI
249
250 instance FromJSON PhyloConfig
251 instance ToJSON PhyloConfig
252
253 instance FromJSON PhyloSubConfigAPI
254 instance ToJSON PhyloSubConfigAPI
255
256 instance FromJSON CorpusParser
257 instance ToJSON CorpusParser
258
259 instance FromJSON ListParser
260 instance ToJSON ListParser
261
262 instance FromJSON PhyloSimilarity
263 instance ToJSON PhyloSimilarity
264
265 instance FromJSON SeaElevation
266 instance ToJSON SeaElevation
267
268 instance FromJSON TimeUnit
269 instance ToJSON TimeUnit
270
271 instance FromJSON MaxCliqueFilter
272 instance ToJSON MaxCliqueFilter
273
274 instance FromJSON Cluster
275 instance ToJSON Cluster
276
277 instance FromJSON PhyloLabel
278 instance ToJSON PhyloLabel
279
280 instance FromJSON Tagger
281 instance ToJSON Tagger
282
283 instance FromJSON Sort
284 instance ToJSON Sort
285
286 instance FromJSON Order
287 instance ToJSON Order
288
289 instance FromJSON Filter
290 instance ToJSON Filter
291
292 instance FromJSON SynchronyScope
293 instance ToJSON SynchronyScope
294
295 instance FromJSON SynchronyStrategy
296 instance ToJSON SynchronyStrategy
297
298 instance FromJSON Synchrony
299 instance ToJSON Synchrony
300
301 instance FromJSON Quality
302 instance ToJSON Quality
303
304
305 -- | Software parameters
306 data Software =
307 Software { _software_name :: Text
308 , _software_version :: Text
309 } deriving (Generic, Show, Eq)
310
311 instance ToSchema Software where
312 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
313
314
315
316 defaultSoftware :: Software
317 defaultSoftware =
318 Software { _software_name = pack "GarganText"
319 , _software_version = pack "v5" }
320
321
322 -- | Global parameters of a Phylo
323 data PhyloParam =
324 PhyloParam { _phyloParam_version :: Text
325 , _phyloParam_software :: Software
326 , _phyloParam_config :: PhyloConfig
327 } deriving (Generic, Show, Eq)
328
329 instance ToSchema PhyloParam where
330 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
331
332
333
334 defaultPhyloParam :: PhyloParam
335 defaultPhyloParam =
336 PhyloParam { _phyloParam_version = pack "v3"
337 , _phyloParam_software = defaultSoftware
338 , _phyloParam_config = defaultConfig }
339
340
341 ------------------
342 -- | Document | --
343 ------------------
344
345 -- | Date : a simple Integer
346 type Date = Int
347
348 -- | DateStr : the string version of a Date
349 type DateStr = Text
350
351 -- | Ngrams : a contiguous sequence of n terms
352 type Ngrams = Text
353
354 -- Document : a piece of Text linked to a Date
355 -- date = computational date; date' = original string date yyyy-mm-dd
356 -- Export Database to Document
357 data Document = Document
358 { date :: Date -- datatype Date {unDate :: Int}
359 , date' :: DateStr -- show date
360 , text :: [Ngrams]
361 , weight :: Maybe Double
362 , sources :: [Text]
363 , docTime :: TimeUnit
364 } deriving (Eq,Show,Generic,NFData)
365
366
367 --------------------
368 -- | Foundation | --
369 --------------------
370
371
372 -- | The Foundations of a Phylo created from a given TermList
373 data PhyloFoundations = PhyloFoundations
374 { _foundations_roots :: (Vector Ngrams)
375 , _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
376 } deriving (Generic, Show, Eq)
377
378 data PhyloCounts = PhyloCounts
379 { coocByDate :: !(Map Date Cooc)
380 , docsByDate :: !(Map Date Double)
381 , rootsCountByDate :: !(Map Date (Map Int Double))
382 , rootsCount :: !(Map Int Double)
383 , rootsFreq :: !(Map Int Double)
384 , lastRootsFreq :: !(Map Int Double)
385 } deriving (Generic, Show, Eq)
386
387 data PhyloSources = PhyloSources
388 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
389
390 instance ToSchema PhyloFoundations where
391 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
392 instance ToSchema PhyloCounts where
393 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
394 instance ToSchema PhyloSources where
395 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
396
397 ---------------------------
398 -- | Coocurency Matrix | --
399 ---------------------------
400
401
402 -- | Cooc : a coocurency matrix between two ngrams
403 type Cooc = Map (Int,Int) Double
404
405
406 -------------------
407 -- | Phylomemy | --
408 -------------------
409
410 -- | Period : a tuple of Dates
411 type Period = (Date,Date)
412
413 -- | PeriodStr : a tuple of DateStr
414 type PeriodStr = (DateStr,DateStr)
415
416
417
418
419 -- | Phylo datatype of a phylomemy
420 -- foundations : the foundations of the phylo
421 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
422 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
423 -- param : the parameters of the phylomemy (with the user's configuration)
424 -- periods : the temporal steps of a phylomemy
425 data Phylo =
426 Phylo { _phylo_foundations :: PhyloFoundations
427 , _phylo_sources :: PhyloSources
428 , _phylo_counts :: PhyloCounts
429 , _phylo_seaLadder :: [Double]
430 , _phylo_param :: PhyloParam
431 , _phylo_periods :: Map Period PhyloPeriod
432 , _phylo_quality :: Double
433 , _phylo_level :: Double
434 }
435 deriving (Generic, Show, Eq)
436
437 instance ToSchema Phylo where
438 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
439
440
441 ----------------
442 -- | Period | --
443 ----------------
444
445 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
446 -- id: tuple (start date, end date) of the temporal step of the phylomemy
447 -- scales: scales of synchronic description
448 data PhyloPeriod =
449 PhyloPeriod { _phylo_periodPeriod :: Period
450 , _phylo_periodPeriodStr :: PeriodStr
451 , _phylo_periodScales :: Map PhyloScaleId PhyloScale
452 } deriving (Generic, Show, Eq)
453
454 instance ToSchema PhyloPeriod where
455 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
456
457 ---------------
458 -- | Scale | --
459 ---------------
460
461 -- | Scale : a scale of synchronic description
462 type Scale = Int
463
464 -- | PhyloScaleId : the id of a scale of synchronic description
465 type PhyloScaleId = (Period,Scale)
466
467 -- | PhyloScale : sub-structure of the phylomemy in scale of synchronic description
468 data PhyloScale =
469 PhyloScale { _phylo_scalePeriod :: Period
470 , _phylo_scalePeriodStr :: PeriodStr
471 , _phylo_scaleScale :: Scale
472 , _phylo_scaleGroups :: Map PhyloGroupId PhyloGroup
473 }
474 deriving (Generic, Show, Eq)
475
476 instance ToSchema PhyloScale where
477 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
478
479
480 type PhyloGroupId = (PhyloScaleId, Int)
481
482 -- | BranchId : (a scale, a sequence of branch index)
483 -- the sequence is a path of heritage from the most to the less specific branch
484 type PhyloBranchId = (Scale, [Int])
485
486 -- | PhyloGroup : group of ngrams at each scale and period
487 data PhyloGroup =
488 PhyloGroup { _phylo_groupPeriod :: Period
489 , _phylo_groupPeriod' :: (Text,Text)
490 , _phylo_groupScale :: Scale
491 , _phylo_groupIndex :: Int
492 , _phylo_groupLabel :: Text
493 , _phylo_groupSupport :: Support
494 , _phylo_groupWeight :: Maybe Double
495 , _phylo_groupSources :: [Int]
496 , _phylo_groupNgrams :: [Int]
497 , _phylo_groupCooc :: !(Cooc)
498 , _phylo_groupDensity :: Double
499 , _phylo_groupBranchId :: PhyloBranchId
500 , _phylo_groupMeta :: Map Text [Double]
501 , _phylo_groupRootsCount :: Map Int Double
502 , _phylo_groupScaleParents :: [Pointer]
503 , _phylo_groupScaleChilds :: [Pointer]
504 , _phylo_groupPeriodParents :: [Pointer]
505 , _phylo_groupPeriodChilds :: [Pointer]
506 , _phylo_groupAncestors :: [Pointer]
507 , _phylo_groupPeriodMemoryParents :: [Pointer']
508 , _phylo_groupPeriodMemoryChilds :: [Pointer']
509 }
510 deriving (Generic, Show, Eq, NFData)
511
512 instance ToSchema PhyloGroup where
513 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
514
515
516 -- | Weight : A generic mesure that can be associated with an Id
517 type Weight = Double
518 type Thr = Double
519
520 -- | Pointer : A weighted pointer to a given PhyloGroup
521 type Pointer = (PhyloGroupId, Weight)
522 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
523 type Pointer' = (PhyloGroupId, (Thr,Weight))
524
525 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
526 data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
527
528
529 --------------------------
530 -- | Phylo Clustering | --
531 --------------------------
532
533 -- | Support : Number of Documents where a Cluster occurs
534 type Support = Int
535
536 data Clustering = Clustering
537 { _clustering_roots :: [Int]
538 , _clustering_support :: Support
539 , _clustering_period :: Period
540 -- additional materials for visualization
541 , _clustering_visWeighting :: Maybe Double
542 , _clustering_visFiltering :: [Int]
543 } deriving (Generic,NFData,Show,Eq)
544
545 ----------------
546 -- | Export | --
547 ----------------
548
549 type DotId = TextLazy.Text
550
551 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
552
553 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
554 instance ToSchema Filter where
555 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
556
557
558 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
559
560 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
561 instance ToSchema Sort where
562 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
563
564
565 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
566 instance ToSchema Tagger where
567 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
568
569
570 data PhyloLabel =
571 BranchLabel
572 { _branch_labelTagger :: Tagger
573 , _branch_labelSize :: Int }
574 | GroupLabel
575 { _group_labelTagger :: Tagger
576 , _group_labelSize :: Int }
577 deriving (Show,Generic,Eq)
578
579 instance ToSchema PhyloLabel where
580 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
581
582
583 data PhyloBranch =
584 PhyloBranch
585 { _branch_id :: PhyloBranchId
586 , _branch_canonId :: [Int]
587 , _branch_seaLevel :: [Double]
588 , _branch_x :: Double
589 , _branch_y :: Double
590 , _branch_w :: Double
591 , _branch_t :: Double
592 , _branch_label :: Text
593 , _branch_meta :: Map Text [Double]
594 } deriving (Generic, Show, Eq)
595
596 instance ToSchema PhyloBranch where
597 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
598
599 data PhyloExport =
600 PhyloExport
601 { _export_groups :: [PhyloGroup]
602 , _export_branches :: [PhyloBranch]
603 } deriving (Generic, Show)
604 instance ToSchema PhyloExport where
605 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
606
607
608 ----------------
609 -- | Lenses | --
610 ----------------
611
612 makeLenses ''PhyloConfig
613 makeLenses ''PhyloSubConfigAPI
614 makeLenses ''PhyloSimilarity
615 makeLenses ''SeaElevation
616 makeLenses ''Quality
617 makeLenses ''Cluster
618 makeLenses ''PhyloLabel
619 makeLenses ''TimeUnit
620 makeLenses ''PhyloFoundations
621 makeLenses ''Clustering
622 makeLenses ''Phylo
623 makeLenses ''PhyloPeriod
624 makeLenses ''PhyloScale
625 makeLenses ''PhyloGroup
626 makeLenses ''PhyloParam
627 makeLenses ''PhyloExport
628 makeLenses ''PhyloBranch
629
630 ------------------------
631 -- | JSON instances | --
632 ------------------------
633
634 instance FromJSON Phylo
635 instance ToJSON Phylo
636
637 instance FromJSON PhyloSources
638 instance ToJSON PhyloSources
639
640 instance FromJSON PhyloParam
641 instance ToJSON PhyloParam
642
643 instance FromJSON PhyloCounts
644 instance ToJSON PhyloCounts
645
646 instance FromJSON PhyloPeriod
647 instance ToJSON PhyloPeriod
648
649 instance FromJSON PhyloScale
650 instance ToJSON PhyloScale
651
652 instance FromJSON Software
653 instance ToJSON Software
654
655 instance FromJSON PhyloGroup
656 instance ToJSON PhyloGroup
657
658 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)