]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
Merge remote-tracking branch 'origin/445-dev-doc-upload-lang' into dev
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Set (Set)
35 import Data.Swagger
36 import Data.Text (Text, pack)
37 import Data.Vector (Vector)
38 import GHC.Generics
39 import GHC.IO (FilePath)
40 import Gargantext.Core.Text.Context (TermList)
41 import Gargantext.Core.Utils.Prefix (unPrefix)
42 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
43 import Gargantext.Prelude
44 import qualified Data.Text.Lazy as TextLazy
45
46 ----------------
47 -- | PhyloConfig | --
48 ----------------
49
50 data CorpusParser =
51 Wos {_wos_limit :: Int}
52 | Csv {_csv_limit :: Int}
53 | Csv' {_csv'_limit :: Int}
54 deriving (Show,Generic,Eq)
55
56 instance ToSchema CorpusParser where
57 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
58
59
60 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
61 instance ToSchema ListParser
62
63
64 data SeaElevation =
65 Constante
66 { _cons_start :: Double
67 , _cons_gap :: Double }
68 | Adaptative
69 { _adap_steps :: Double }
70 deriving (Show,Generic,Eq)
71
72 instance ToSchema SeaElevation
73
74 data Proximity =
75 WeightedLogJaccard
76 { _wlj_sensibility :: Double
77 , _wlj_minSharedNgrams :: Int }
78 | WeightedLogSim
79 { _wls_sensibility :: Double
80 , _wls_minSharedNgrams :: Int }
81 | Hamming
82 { _hmg_sensibility :: Double
83 , _hmg_minSharedNgrams :: Int}
84
85 deriving (Show,Generic,Eq)
86
87 instance ToSchema Proximity where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
89
90
91 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
92 deriving (Show,Generic,Eq, ToSchema)
93
94 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
95 deriving (Show,Generic,Eq)
96
97 instance ToSchema SynchronyStrategy where
98 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
99
100
101 data Synchrony =
102 ByProximityThreshold
103 { _bpt_threshold :: Double
104 , _bpt_sensibility :: Double
105 , _bpt_scope :: SynchronyScope
106 , _bpt_strategy :: SynchronyStrategy }
107 | ByProximityDistribution
108 { _bpd_sensibility :: Double
109 , _bpd_strategy :: SynchronyStrategy }
110 deriving (Show,Generic,Eq)
111
112 instance ToSchema Synchrony where
113 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
114
115
116
117 data TimeUnit =
118 Epoch
119 { _epoch_period :: Int
120 , _epoch_step :: Int
121 , _epoch_matchingFrame :: Int }
122 | Year
123 { _year_period :: Int
124 , _year_step :: Int
125 , _year_matchingFrame :: Int }
126 | Month
127 { _month_period :: Int
128 , _month_step :: Int
129 , _month_matchingFrame :: Int }
130 | Week
131 { _week_period :: Int
132 , _week_step :: Int
133 , _week_matchingFrame :: Int }
134 | Day
135 { _day_period :: Int
136 , _day_step :: Int
137 , _day_matchingFrame :: Int }
138 deriving (Show,Generic,Eq)
139
140 instance ToSchema TimeUnit where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
142
143
144 data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
145
146 instance ToSchema MaxCliqueFilter where
147 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
148
149
150
151 data Cluster =
152 Fis
153 { _fis_support :: Int
154 , _fis_size :: Int }
155 | MaxClique
156 { _mcl_size :: Int
157 , _mcl_threshold :: Double
158 , _mcl_filter :: MaxCliqueFilter }
159 deriving (Show,Generic,Eq)
160
161 instance ToSchema Cluster where
162 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
163
164
165 data Quality =
166 Quality { _qua_granularity :: Double
167 , _qua_minBranch :: Int }
168 deriving (Show,Generic,Eq)
169
170 instance ToSchema Quality where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
172
173
174 data PhyloConfig =
175 PhyloConfig { corpusPath :: FilePath
176 , listPath :: FilePath
177 , outputPath :: FilePath
178 , corpusParser :: CorpusParser
179 , listParser :: ListParser
180 , phyloName :: Text
181 , phyloScale :: Int
182 , phyloProximity :: Proximity
183 , seaElevation :: SeaElevation
184 , findAncestors :: Bool
185 , phyloSynchrony :: Synchrony
186 , phyloQuality :: Quality
187 , timeUnit :: TimeUnit
188 , clique :: Cluster
189 , exportLabel :: [PhyloLabel]
190 , exportSort :: Sort
191 , exportFilter :: [Filter]
192 } deriving (Show,Generic,Eq)
193
194
195 ------------------------------------------------------------------------
196 data PhyloSubConfig =
197 PhyloSubConfig { _sc_phyloProximity :: Double
198 , _sc_phyloSynchrony :: Double
199 , _sc_phyloQuality :: Double
200 , _sc_timeUnit :: TimeUnit
201 , _sc_clique :: Cluster
202 , _sc_exportFilter :: Double
203 }
204 deriving (Show,Generic,Eq)
205
206
207 subConfig2config :: PhyloSubConfig -> PhyloConfig
208 subConfig2config subConfig = defaultConfig { phyloProximity = WeightedLogJaccard (_sc_phyloProximity subConfig) 1
209 , phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
210 , phyloQuality = Quality (_sc_phyloQuality subConfig) 1
211 , timeUnit = _sc_timeUnit subConfig
212 , clique = _sc_clique subConfig
213 , exportFilter = [ByBranchSize $ _sc_exportFilter subConfig]
214 }
215
216 ------------------------------------------------------------------------
217 defaultConfig :: PhyloConfig
218 defaultConfig =
219 PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
220 , listPath = "list.csv" -- useful for commandline only
221 , outputPath = "data/"
222 , corpusParser = Csv 100000
223 , listParser = V4
224 , phyloName = pack "Phylo Name"
225 , phyloScale = 2
226 , phyloProximity = WeightedLogJaccard 0.5 1
227 , seaElevation = Constante 0.1 0.1
228 , findAncestors = False
229 , phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
230 , phyloQuality = Quality 0.5 1
231 , timeUnit = Year 3 1 5
232 , clique = MaxClique 5 0.0001 ByThreshold
233 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
234 , exportSort = ByHierarchy Desc
235 , exportFilter = [ByBranchSize 3]
236 }
237
238 -- Main Instances
239 instance ToSchema PhyloConfig
240 instance ToSchema PhyloSubConfig
241
242 instance FromJSON PhyloConfig
243 instance ToJSON PhyloConfig
244
245 instance FromJSON PhyloSubConfig
246 instance ToJSON PhyloSubConfig
247
248 instance FromJSON CorpusParser
249 instance ToJSON CorpusParser
250
251 instance FromJSON ListParser
252 instance ToJSON ListParser
253
254 instance FromJSON Proximity
255 instance ToJSON Proximity
256
257 instance FromJSON SeaElevation
258 instance ToJSON SeaElevation
259
260 instance FromJSON TimeUnit
261 instance ToJSON TimeUnit
262
263 instance FromJSON MaxCliqueFilter
264 instance ToJSON MaxCliqueFilter
265
266 instance FromJSON Cluster
267 instance ToJSON Cluster
268
269 instance FromJSON PhyloLabel
270 instance ToJSON PhyloLabel
271
272 instance FromJSON Tagger
273 instance ToJSON Tagger
274
275 instance FromJSON Sort
276 instance ToJSON Sort
277
278 instance FromJSON Order
279 instance ToJSON Order
280
281 instance FromJSON Filter
282 instance ToJSON Filter
283
284 instance FromJSON SynchronyScope
285 instance ToJSON SynchronyScope
286
287 instance FromJSON SynchronyStrategy
288 instance ToJSON SynchronyStrategy
289
290 instance FromJSON Synchrony
291 instance ToJSON Synchrony
292
293 instance FromJSON Quality
294 instance ToJSON Quality
295
296
297 -- | Software parameters
298 data Software =
299 Software { _software_name :: Text
300 , _software_version :: Text
301 } deriving (Generic, Show, Eq)
302
303 instance ToSchema Software where
304 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
305
306
307
308 defaultSoftware :: Software
309 defaultSoftware =
310 Software { _software_name = pack "GarganText"
311 , _software_version = pack "v5" }
312
313
314 -- | Global parameters of a Phylo
315 data PhyloParam =
316 PhyloParam { _phyloParam_version :: Text
317 , _phyloParam_software :: Software
318 , _phyloParam_config :: PhyloConfig
319 } deriving (Generic, Show, Eq)
320
321 instance ToSchema PhyloParam where
322 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
323
324
325
326 defaultPhyloParam :: PhyloParam
327 defaultPhyloParam =
328 PhyloParam { _phyloParam_version = pack "v3"
329 , _phyloParam_software = defaultSoftware
330 , _phyloParam_config = defaultConfig }
331
332
333 ------------------
334 -- | Document | --
335 ------------------
336
337 -- | Date : a simple Integer
338 type Date = Int
339
340 -- | DateStr : the string version of a Date
341 type DateStr = Text
342
343 -- | Ngrams : a contiguous sequence of n terms
344 type Ngrams = Text
345
346 -- Document : a piece of Text linked to a Date
347 -- date = computational date; date' = original string date yyyy-mm-dd
348 -- Export Database to Document
349 data Document = Document
350 { date :: Date -- datatype Date {unDate :: Int}
351 , date' :: DateStr -- show date
352 , text :: [Ngrams]
353 , weight :: Maybe Double
354 , sources :: [Text]
355 } deriving (Eq,Show,Generic,NFData)
356
357
358 --------------------
359 -- | Foundation | --
360 --------------------
361
362
363 -- | The Foundations of a Phylo created from a given TermList
364 data PhyloFoundations = PhyloFoundations
365 { _foundations_roots :: !(Vector Ngrams)
366 , _foundations_mapList :: TermList
367 } deriving (Generic, Show, Eq)
368
369 instance ToSchema PhyloFoundations where
370 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
371
372
373
374 data PhyloSources = PhyloSources
375 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
376
377 instance ToSchema PhyloSources where
378 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
379
380 ---------------------------
381 -- | Coocurency Matrix | --
382 ---------------------------
383
384
385 -- | Cooc : a coocurency matrix between two ngrams
386 type Cooc = Map (Int,Int) Double
387
388
389 -------------------
390 -- | Phylomemy | --
391 -------------------
392
393 -- | Period : a tuple of Dates
394 type Period = (Date,Date)
395
396 -- | PeriodStr : a tuple of DateStr
397 type PeriodStr = (DateStr,DateStr)
398
399
400 -- | Phylo datatype of a phylomemy
401 -- foundations : the foundations of the phylo
402 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
403 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
404 -- param : the parameters of the phylomemy (with the user's configuration)
405 -- periods : the temporal steps of a phylomemy
406 data Phylo =
407 Phylo { _phylo_foundations :: PhyloFoundations
408 , _phylo_sources :: PhyloSources
409 , _phylo_timeCooc :: !(Map Date Cooc)
410 , _phylo_timeDocs :: !(Map Date Double)
411 , _phylo_termFreq :: !(Map Int Double)
412 , _phylo_lastTermFreq :: !(Map Int Double)
413 , _phylo_diaSimScan :: Set Double
414 , _phylo_param :: PhyloParam
415 , _phylo_periods :: Map Period PhyloPeriod
416 , _phylo_quality :: Double
417 }
418 deriving (Generic, Show, Eq)
419
420 instance ToSchema Phylo where
421 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
422
423
424 ----------------
425 -- | Period | --
426 ----------------
427
428 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
429 -- id: tuple (start date, end date) of the temporal step of the phylomemy
430 -- scales: scales of synchronic description
431 data PhyloPeriod =
432 PhyloPeriod { _phylo_periodPeriod :: Period
433 , _phylo_periodPeriodStr :: PeriodStr
434 , _phylo_periodScales :: Map PhyloScaleId PhyloScale
435 } deriving (Generic, Show, Eq)
436
437 instance ToSchema PhyloPeriod where
438 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
439
440 ---------------
441 -- | Scale | --
442 ---------------
443
444 -- | Scale : a scale of synchronic description
445 type Scale = Int
446
447 -- | PhyloScaleId : the id of a scale of synchronic description
448 type PhyloScaleId = (Period,Scale)
449
450 -- | PhyloScale : sub-structure of the phylomemy in scale of synchronic description
451 data PhyloScale =
452 PhyloScale { _phylo_scalePeriod :: Period
453 , _phylo_scalePeriodStr :: PeriodStr
454 , _phylo_scaleScale :: Scale
455 , _phylo_scaleGroups :: Map PhyloGroupId PhyloGroup
456 }
457 deriving (Generic, Show, Eq)
458
459 instance ToSchema PhyloScale where
460 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
461
462
463 type PhyloGroupId = (PhyloScaleId, Int)
464
465 -- | BranchId : (a scale, a sequence of branch index)
466 -- the sequence is a path of heritage from the most to the less specific branch
467 type PhyloBranchId = (Scale, [Int])
468
469 -- | PhyloGroup : group of ngrams at each scale and period
470 data PhyloGroup =
471 PhyloGroup { _phylo_groupPeriod :: Period
472 , _phylo_groupPeriod' :: (Text,Text)
473 , _phylo_groupScale :: Scale
474 , _phylo_groupIndex :: Int
475 , _phylo_groupLabel :: Text
476 , _phylo_groupSupport :: Support
477 , _phylo_groupWeight :: Maybe Double
478 , _phylo_groupSources :: [Int]
479 , _phylo_groupNgrams :: [Int]
480 , _phylo_groupCooc :: !(Cooc)
481 , _phylo_groupBranchId :: PhyloBranchId
482 , _phylo_groupMeta :: Map Text [Double]
483 , _phylo_groupScaleParents :: [Pointer]
484 , _phylo_groupScaleChilds :: [Pointer]
485 , _phylo_groupPeriodParents :: [Pointer]
486 , _phylo_groupPeriodChilds :: [Pointer]
487 , _phylo_groupAncestors :: [Pointer]
488 , _phylo_groupPeriodMemoryParents :: [Pointer']
489 , _phylo_groupPeriodMemoryChilds :: [Pointer']
490 }
491 deriving (Generic, Show, Eq, NFData)
492
493 instance ToSchema PhyloGroup where
494 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
495
496
497 -- | Weight : A generic mesure that can be associated with an Id
498 type Weight = Double
499 type Thr = Double
500
501 -- | Pointer : A weighted pointer to a given PhyloGroup
502 type Pointer = (PhyloGroupId, Weight)
503 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
504 type Pointer' = (PhyloGroupId, (Thr,Weight))
505
506 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
507 data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
508
509
510 --------------------------
511 -- | Phylo Clustering | --
512 --------------------------
513
514 -- | Support : Number of Documents where a Cluster occurs
515 type Support = Int
516
517 data Clustering = Clustering
518 { _clustering_roots :: [Int]
519 , _clustering_support :: Support
520 , _clustering_period :: Period
521 -- additional materials for visualization
522 , _clustering_visWeighting :: Maybe Double
523 , _clustering_visFiltering :: [Int]
524 } deriving (Generic,NFData,Show,Eq)
525
526 ----------------
527 -- | Export | --
528 ----------------
529
530 type DotId = TextLazy.Text
531
532 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
533
534 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
535 instance ToSchema Filter where
536 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
537
538
539 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
540
541 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
542 instance ToSchema Sort where
543 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
544
545
546 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
547 instance ToSchema Tagger where
548 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
549
550
551 data PhyloLabel =
552 BranchLabel
553 { _branch_labelTagger :: Tagger
554 , _branch_labelSize :: Int }
555 | GroupLabel
556 { _group_labelTagger :: Tagger
557 , _group_labelSize :: Int }
558 deriving (Show,Generic,Eq)
559
560 instance ToSchema PhyloLabel where
561 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
562
563
564 data PhyloBranch =
565 PhyloBranch
566 { _branch_id :: PhyloBranchId
567 , _branch_canonId :: [Int]
568 , _branch_seaLevel :: [Double]
569 , _branch_x :: Double
570 , _branch_y :: Double
571 , _branch_w :: Double
572 , _branch_t :: Double
573 , _branch_label :: Text
574 , _branch_meta :: Map Text [Double]
575 } deriving (Generic, Show, Eq)
576
577 instance ToSchema PhyloBranch where
578 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
579
580 data PhyloExport =
581 PhyloExport
582 { _export_groups :: [PhyloGroup]
583 , _export_branches :: [PhyloBranch]
584 } deriving (Generic, Show)
585 instance ToSchema PhyloExport where
586 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
587
588
589 ----------------
590 -- | Lenses | --
591 ----------------
592
593 makeLenses ''PhyloConfig
594 makeLenses ''PhyloSubConfig
595 makeLenses ''Proximity
596 makeLenses ''SeaElevation
597 makeLenses ''Quality
598 makeLenses ''Cluster
599 makeLenses ''PhyloLabel
600 makeLenses ''TimeUnit
601 makeLenses ''PhyloFoundations
602 makeLenses ''Clustering
603 makeLenses ''Phylo
604 makeLenses ''PhyloPeriod
605 makeLenses ''PhyloScale
606 makeLenses ''PhyloGroup
607 makeLenses ''PhyloParam
608 makeLenses ''PhyloExport
609 makeLenses ''PhyloBranch
610
611 ------------------------
612 -- | JSON instances | --
613 ------------------------
614
615 instance FromJSON Phylo
616 instance ToJSON Phylo
617
618 instance FromJSON PhyloSources
619 instance ToJSON PhyloSources
620
621 instance FromJSON PhyloParam
622 instance ToJSON PhyloParam
623
624 instance FromJSON PhyloPeriod
625 instance ToJSON PhyloPeriod
626
627 instance FromJSON PhyloScale
628 instance ToJSON PhyloScale
629
630 instance FromJSON Software
631 instance ToJSON Software
632
633 instance FromJSON PhyloGroup
634 instance ToJSON PhyloGroup
635
636 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)