]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
[VERSION] +1 to 0.0.6.9.6
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Swagger
35 import Data.Text (Text, pack)
36 import Data.Vector (Vector)
37 import GHC.Generics
38 import GHC.IO (FilePath)
39 import Gargantext.Core.Text.Context (TermList)
40 import Gargantext.Core.Utils.Prefix (unPrefix)
41 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
42 import Gargantext.Prelude
43 import qualified Data.Text.Lazy as TextLazy
44
45 ----------------
46 -- | PhyloConfig | --
47 ----------------
48
49 data CorpusParser =
50 Wos {_wos_limit :: Int}
51 | Csv {_csv_limit :: Int}
52 | Csv' {_csv'_limit :: Int}
53 deriving (Show,Generic,Eq)
54
55 instance ToSchema CorpusParser where
56 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
57
58
59 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
60 instance ToSchema ListParser
61
62
63 data SeaElevation =
64 Constante
65 { _cons_start :: Double
66 , _cons_step :: Double }
67 | Adaptative
68 { _adap_granularity :: Double }
69 deriving (Show,Generic,Eq)
70
71 instance ToSchema SeaElevation
72
73 data Proximity =
74 WeightedLogJaccard
75 { _wlj_sensibility :: Double
76 , _wlj_minSharedNgrams :: Int }
77 | WeightedLogSim
78 { _wls_sensibility :: Double
79 , _wls_minSharedNgrams :: Int }
80 | Hamming
81 { _hmg_sensibility :: Double
82 , _hmg_minSharedNgrams :: Int}
83
84 deriving (Show,Generic,Eq)
85
86 instance ToSchema Proximity where
87 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
88
89
90 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
91 deriving (Show,Generic,Eq, ToSchema)
92
93 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
94 deriving (Show,Generic,Eq)
95
96 instance ToSchema SynchronyStrategy where
97 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
98
99
100 data Synchrony =
101 ByProximityThreshold
102 { _bpt_threshold :: Double
103 , _bpt_sensibility :: Double
104 , _bpt_scope :: SynchronyScope
105 , _bpt_strategy :: SynchronyStrategy }
106 | ByProximityDistribution
107 { _bpd_sensibility :: Double
108 , _bpd_strategy :: SynchronyStrategy }
109 deriving (Show,Generic,Eq)
110
111 instance ToSchema Synchrony where
112 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
113
114
115
116 data TimeUnit =
117 Epoch
118 { _epoch_period :: Int
119 , _epoch_step :: Int
120 , _epoch_matchingFrame :: Int }
121 | Year
122 { _year_period :: Int
123 , _year_step :: Int
124 , _year_matchingFrame :: Int }
125 | Month
126 { _month_period :: Int
127 , _month_step :: Int
128 , _month_matchingFrame :: Int }
129 | Week
130 { _week_period :: Int
131 , _week_step :: Int
132 , _week_matchingFrame :: Int }
133 | Day
134 { _day_period :: Int
135 , _day_step :: Int
136 , _day_matchingFrame :: Int }
137 deriving (Show,Generic,Eq)
138
139 instance ToSchema TimeUnit where
140 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
141
142
143 data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
144
145 instance ToSchema MaxCliqueFilter where
146 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
147
148
149
150 data Cluster =
151 Fis
152 { _fis_support :: Int
153 , _fis_size :: Int }
154 | MaxClique
155 { _mcl_size :: Int
156 , _mcl_threshold :: Double
157 , _mcl_filter :: MaxCliqueFilter }
158 deriving (Show,Generic,Eq)
159
160 instance ToSchema Cluster where
161 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
162
163
164 data Quality =
165 Quality { _qua_granularity :: Double
166 , _qua_minBranch :: Int }
167 deriving (Show,Generic,Eq)
168
169 instance ToSchema Quality where
170 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
171
172
173 data PhyloConfig =
174 PhyloConfig { corpusPath :: FilePath
175 , listPath :: FilePath
176 , outputPath :: FilePath
177 , corpusParser :: CorpusParser
178 , listParser :: ListParser
179 , phyloName :: Text
180 , phyloScale :: Int
181 , phyloProximity :: Proximity
182 , seaElevation :: SeaElevation
183 , findAncestors :: Bool
184 , phyloSynchrony :: Synchrony
185 , phyloQuality :: Quality
186 , timeUnit :: TimeUnit
187 , clique :: Cluster
188 , exportLabel :: [PhyloLabel]
189 , exportSort :: Sort
190 , exportFilter :: [Filter]
191 } deriving (Show,Generic,Eq)
192
193
194 ------------------------------------------------------------------------
195 data PhyloSubConfig =
196 PhyloSubConfig { _sc_phyloProximity :: Double
197 , _sc_phyloSynchrony :: Double
198 , _sc_phyloQuality :: Double
199 , _sc_timeUnit :: TimeUnit
200 , _sc_clique :: Cluster
201 , _sc_exportFilter :: Double
202 }
203 deriving (Show,Generic,Eq)
204
205
206 subConfig2config :: PhyloSubConfig -> PhyloConfig
207 subConfig2config subConfig = defaultConfig { phyloProximity = WeightedLogJaccard (_sc_phyloProximity subConfig) 1
208 , phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
209 , phyloQuality = Quality (_sc_phyloQuality subConfig) 1
210 , timeUnit = _sc_timeUnit subConfig
211 , clique = _sc_clique subConfig
212 , exportFilter = [ByBranchSize $ _sc_exportFilter subConfig]
213 }
214
215 ------------------------------------------------------------------------
216 defaultConfig :: PhyloConfig
217 defaultConfig =
218 PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
219 , listPath = "list.csv" -- useful for commandline only
220 , outputPath = "data/"
221 , corpusParser = Csv 100000
222 , listParser = V4
223 , phyloName = pack "Phylo Name"
224 , phyloScale = 2
225 , phyloProximity = WeightedLogJaccard 0.5 1
226 , seaElevation = Constante 0.1 0.1
227 , findAncestors = False
228 , phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
229 , phyloQuality = Quality 0.5 1
230 , timeUnit = Year 3 1 5
231 , clique = MaxClique 5 0.0001 ByThreshold
232 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
233 , exportSort = ByHierarchy Desc
234 , exportFilter = [ByBranchSize 3]
235 }
236
237 -- Main Instances
238 instance ToSchema PhyloConfig
239 instance ToSchema PhyloSubConfig
240
241 instance FromJSON PhyloConfig
242 instance ToJSON PhyloConfig
243
244 instance FromJSON PhyloSubConfig
245 instance ToJSON PhyloSubConfig
246
247 instance FromJSON CorpusParser
248 instance ToJSON CorpusParser
249
250 instance FromJSON ListParser
251 instance ToJSON ListParser
252
253 instance FromJSON Proximity
254 instance ToJSON Proximity
255
256 instance FromJSON SeaElevation
257 instance ToJSON SeaElevation
258
259 instance FromJSON TimeUnit
260 instance ToJSON TimeUnit
261
262 instance FromJSON MaxCliqueFilter
263 instance ToJSON MaxCliqueFilter
264
265 instance FromJSON Cluster
266 instance ToJSON Cluster
267
268 instance FromJSON PhyloLabel
269 instance ToJSON PhyloLabel
270
271 instance FromJSON Tagger
272 instance ToJSON Tagger
273
274 instance FromJSON Sort
275 instance ToJSON Sort
276
277 instance FromJSON Order
278 instance ToJSON Order
279
280 instance FromJSON Filter
281 instance ToJSON Filter
282
283 instance FromJSON SynchronyScope
284 instance ToJSON SynchronyScope
285
286 instance FromJSON SynchronyStrategy
287 instance ToJSON SynchronyStrategy
288
289 instance FromJSON Synchrony
290 instance ToJSON Synchrony
291
292 instance FromJSON Quality
293 instance ToJSON Quality
294
295
296 -- | Software parameters
297 data Software =
298 Software { _software_name :: Text
299 , _software_version :: Text
300 } deriving (Generic, Show, Eq)
301
302 instance ToSchema Software where
303 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
304
305
306
307 defaultSoftware :: Software
308 defaultSoftware =
309 Software { _software_name = pack "Gargantext"
310 , _software_version = pack "v4" }
311
312
313 -- | Global parameters of a Phylo
314 data PhyloParam =
315 PhyloParam { _phyloParam_version :: Text
316 , _phyloParam_software :: Software
317 , _phyloParam_config :: PhyloConfig
318 } deriving (Generic, Show, Eq)
319
320 instance ToSchema PhyloParam where
321 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
322
323
324
325 defaultPhyloParam :: PhyloParam
326 defaultPhyloParam =
327 PhyloParam { _phyloParam_version = pack "v2.adaptative"
328 , _phyloParam_software = defaultSoftware
329 , _phyloParam_config = defaultConfig }
330
331
332 ------------------
333 -- | Document | --
334 ------------------
335
336 -- | Date : a simple Integer
337 type Date = Int
338
339 -- | DateStr : the string version of a Date
340 type DateStr = Text
341
342 -- | Ngrams : a contiguous sequence of n terms
343 type Ngrams = Text
344
345 -- Document : a piece of Text linked to a Date
346 -- date = computational date; date' = original string date yyyy-mm-dd
347 -- Export Database to Document
348 data Document = Document
349 { date :: Date -- datatype Date {unDate :: Int}
350 , date' :: DateStr -- show date
351 , text :: [Ngrams]
352 , weight :: Maybe Double
353 , sources :: [Text]
354 } deriving (Eq,Show,Generic,NFData)
355
356
357 --------------------
358 -- | Foundation | --
359 --------------------
360
361
362 -- | The Foundations of a Phylo created from a given TermList
363 data PhyloFoundations = PhyloFoundations
364 { _foundations_roots :: !(Vector Ngrams)
365 , _foundations_mapList :: TermList
366 } deriving (Generic, Show, Eq)
367
368 instance ToSchema PhyloFoundations where
369 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
370
371
372
373 data PhyloSources = PhyloSources
374 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
375
376 instance ToSchema PhyloSources where
377 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
378
379 ---------------------------
380 -- | Coocurency Matrix | --
381 ---------------------------
382
383
384 -- | Cooc : a coocurency matrix between two ngrams
385 type Cooc = Map (Int,Int) Double
386
387
388 -------------------
389 -- | Phylomemy | --
390 -------------------
391
392 -- | Period : a tuple of Dates
393 type Period = (Date,Date)
394
395 -- | PeriodStr : a tuple of DateStr
396 type PeriodStr = (DateStr,DateStr)
397
398
399 -- | Phylo datatype of a phylomemy
400 -- foundations : the foundations of the phylo
401 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
402 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
403 -- param : the parameters of the phylomemy (with the user's configuration)
404 -- periods : the temporal steps of a phylomemy
405 data Phylo =
406 Phylo { _phylo_foundations :: PhyloFoundations
407 , _phylo_sources :: PhyloSources
408 , _phylo_timeCooc :: !(Map Date Cooc)
409 , _phylo_timeDocs :: !(Map Date Double)
410 , _phylo_termFreq :: !(Map Int Double)
411 , _phylo_lastTermFreq :: !(Map Int Double)
412 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
413 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
414 , _phylo_param :: PhyloParam
415 , _phylo_periods :: Map Period PhyloPeriod
416 , _phylo_quality :: Double
417 }
418 deriving (Generic, Show, Eq)
419
420 instance ToSchema Phylo where
421 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
422
423
424 ----------------
425 -- | Period | --
426 ----------------
427
428 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
429 -- id: tuple (start date, end date) of the temporal step of the phylomemy
430 -- scales: scales of synchronic description
431 data PhyloPeriod =
432 PhyloPeriod { _phylo_periodPeriod :: Period
433 , _phylo_periodPeriodStr :: PeriodStr
434 , _phylo_periodScales :: Map PhyloScaleId PhyloScale
435 } deriving (Generic, Show, Eq)
436
437 instance ToSchema PhyloPeriod where
438 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
439
440 ---------------
441 -- | Scale | --
442 ---------------
443
444 -- | Scale : a scale of synchronic description
445 type Scale = Int
446
447 -- | PhyloScaleId : the id of a scale of synchronic description
448 type PhyloScaleId = (Period,Scale)
449
450 -- | PhyloScale : sub-structure of the phylomemy in scale of synchronic description
451 data PhyloScale =
452 PhyloScale { _phylo_scalePeriod :: Period
453 , _phylo_scalePeriodStr :: PeriodStr
454 , _phylo_scaleScale :: Scale
455 , _phylo_scaleGroups :: Map PhyloGroupId PhyloGroup
456 }
457 deriving (Generic, Show, Eq)
458
459 instance ToSchema PhyloScale where
460 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
461
462
463 type PhyloGroupId = (PhyloScaleId, Int)
464
465 -- | BranchId : (a scale, a sequence of branch index)
466 -- the sequence is a path of heritage from the most to the less specific branch
467 type PhyloBranchId = (Scale, [Int])
468
469 -- | PhyloGroup : group of ngrams at each scale and period
470 data PhyloGroup =
471 PhyloGroup { _phylo_groupPeriod :: Period
472 , _phylo_groupPeriod' :: (Text,Text)
473 , _phylo_groupScale :: Scale
474 , _phylo_groupIndex :: Int
475 , _phylo_groupLabel :: Text
476 , _phylo_groupSupport :: Support
477 , _phylo_groupWeight :: Maybe Double
478 , _phylo_groupSources :: [Int]
479 , _phylo_groupNgrams :: [Int]
480 , _phylo_groupCooc :: !(Cooc)
481 , _phylo_groupBranchId :: PhyloBranchId
482 , _phylo_groupMeta :: Map Text [Double]
483 , _phylo_groupScaleParents :: [Pointer]
484 , _phylo_groupScaleChilds :: [Pointer]
485 , _phylo_groupPeriodParents :: [Pointer]
486 , _phylo_groupPeriodChilds :: [Pointer]
487 , _phylo_groupAncestors :: [Pointer]
488 , _phylo_groupPeriodMemoryParents :: [Pointer']
489 , _phylo_groupPeriodMemoryChilds :: [Pointer']
490 }
491 deriving (Generic, Show, Eq, NFData)
492
493 instance ToSchema PhyloGroup where
494 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
495
496
497 -- | Weight : A generic mesure that can be associated with an Id
498 type Weight = Double
499 type Thr = Double
500
501 -- | Pointer : A weighted pointer to a given PhyloGroup
502 type Pointer = (PhyloGroupId, Weight)
503 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
504 type Pointer' = (PhyloGroupId, (Thr,Weight))
505
506 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
507 data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
508
509
510 --------------------------
511 -- | Phylo Clustering | --
512 --------------------------
513
514 -- | Support : Number of Documents where a Cluster occurs
515 type Support = Int
516
517 data Clustering = Clustering
518 { _clustering_roots :: [Int]
519 , _clustering_support :: Support
520 , _clustering_period :: Period
521 -- additional materials for visualization
522 , _clustering_visWeighting :: Maybe Double
523 , _clustering_visFiltering :: [Int]
524 } deriving (Generic,NFData,Show,Eq)
525
526 ----------------
527 -- | Export | --
528 ----------------
529
530 type DotId = TextLazy.Text
531
532 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
533
534 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
535 instance ToSchema Filter where
536 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
537
538
539 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
540
541 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
542 instance ToSchema Sort where
543 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
544
545
546 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
547 instance ToSchema Tagger where
548 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
549
550
551 data PhyloLabel =
552 BranchLabel
553 { _branch_labelTagger :: Tagger
554 , _branch_labelSize :: Int }
555 | GroupLabel
556 { _group_labelTagger :: Tagger
557 , _group_labelSize :: Int }
558 deriving (Show,Generic,Eq)
559
560 instance ToSchema PhyloLabel where
561 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
562
563
564 data PhyloBranch =
565 PhyloBranch
566 { _branch_id :: PhyloBranchId
567 , _branch_canonId :: [Int]
568 , _branch_seaLevel :: [Double]
569 , _branch_x :: Double
570 , _branch_y :: Double
571 , _branch_w :: Double
572 , _branch_t :: Double
573 , _branch_label :: Text
574 , _branch_meta :: Map Text [Double]
575 } deriving (Generic, Show, Eq)
576
577 instance ToSchema PhyloBranch where
578 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
579
580 data PhyloExport =
581 PhyloExport
582 { _export_groups :: [PhyloGroup]
583 , _export_branches :: [PhyloBranch]
584 } deriving (Generic, Show)
585 instance ToSchema PhyloExport where
586 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
587
588
589 ----------------
590 -- | Lenses | --
591 ----------------
592
593 makeLenses ''PhyloConfig
594 makeLenses ''PhyloSubConfig
595 makeLenses ''Proximity
596 makeLenses ''SeaElevation
597 makeLenses ''Quality
598 makeLenses ''Cluster
599 makeLenses ''PhyloLabel
600 makeLenses ''TimeUnit
601 makeLenses ''PhyloFoundations
602 makeLenses ''Clustering
603 makeLenses ''Phylo
604 makeLenses ''PhyloPeriod
605 makeLenses ''PhyloScale
606 makeLenses ''PhyloGroup
607 makeLenses ''PhyloParam
608 makeLenses ''PhyloExport
609 makeLenses ''PhyloBranch
610
611 ------------------------
612 -- | JSON instances | --
613 ------------------------
614
615 instance FromJSON Phylo
616 instance ToJSON Phylo
617
618 instance FromJSON PhyloSources
619 instance ToJSON PhyloSources
620
621 instance FromJSON PhyloParam
622 instance ToJSON PhyloParam
623
624 instance FromJSON PhyloPeriod
625 instance ToJSON PhyloPeriod
626
627 instance FromJSON PhyloScale
628 instance ToJSON PhyloScale
629
630 instance FromJSON Software
631 instance ToJSON Software
632
633 instance FromJSON PhyloGroup
634 instance ToJSON PhyloGroup
635
636 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)