]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
[NLP] implement NLP server selection in INI file
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Swagger
35 import Data.Text (Text, pack)
36 import Data.Vector (Vector)
37 import GHC.Generics
38 import GHC.IO (FilePath)
39 import Gargantext.Core.Utils.Prefix (unPrefix)
40 import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
41 import Gargantext.Prelude
42 import qualified Data.Text.Lazy as TextLazy
43
44 ----------------
45 -- | PhyloConfig | --
46 ----------------
47
48 data CorpusParser =
49 Wos {_wos_limit :: Int}
50 | Csv {_csv_limit :: Int}
51 | Csv' {_csv'_limit :: Int}
52 deriving (Show,Generic,Eq)
53
54 instance ToSchema CorpusParser where
55 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
56
57
58 data ListParser = V3 | V4 deriving (Show,Generic,Eq)
59 instance ToSchema ListParser
60
61
62 data SeaElevation =
63 Constante
64 { _cons_start :: Double
65 , _cons_gap :: Double }
66 | Adaptative
67 { _adap_steps :: Double }
68 | Evolving
69 { _evol_neighborhood :: Bool }
70 deriving (Show,Generic,Eq)
71
72 instance ToSchema SeaElevation
73
74 data PhyloSimilarity =
75 WeightedLogJaccard
76 { _wlj_sensibility :: Double
77 , _wlj_minSharedNgrams :: Int }
78 | WeightedLogSim
79 { _wls_sensibility :: Double
80 , _wls_minSharedNgrams :: Int }
81 | Hamming
82 { _hmg_sensibility :: Double
83 , _hmg_minSharedNgrams :: Int}
84
85 deriving (Show,Generic,Eq)
86
87 instance ToSchema PhyloSimilarity where
88 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
89
90
91 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
92 deriving (Show,Generic,Eq, ToSchema)
93
94 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
95 deriving (Show,Generic,Eq)
96
97 instance ToSchema SynchronyStrategy where
98 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
99
100
101 data Synchrony =
102 ByProximityThreshold
103 { _bpt_threshold :: Double
104 , _bpt_sensibility :: Double
105 , _bpt_scope :: SynchronyScope
106 , _bpt_strategy :: SynchronyStrategy }
107 | ByProximityDistribution
108 { _bpd_sensibility :: Double
109 , _bpd_strategy :: SynchronyStrategy }
110 deriving (Show,Generic,Eq)
111
112 instance ToSchema Synchrony where
113 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
114
115
116
117 data TimeUnit =
118 Epoch
119 { _epoch_period :: Int
120 , _epoch_step :: Int
121 , _epoch_matchingFrame :: Int }
122 | Year
123 { _year_period :: Int
124 , _year_step :: Int
125 , _year_matchingFrame :: Int }
126 | Month
127 { _month_period :: Int
128 , _month_step :: Int
129 , _month_matchingFrame :: Int }
130 | Week
131 { _week_period :: Int
132 , _week_step :: Int
133 , _week_matchingFrame :: Int }
134 | Day
135 { _day_period :: Int
136 , _day_step :: Int
137 , _day_matchingFrame :: Int }
138 deriving (Show,Generic,Eq)
139
140 instance ToSchema TimeUnit where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
142
143
144 data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
145
146 instance ToSchema MaxCliqueFilter where
147 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
148
149
150
151 data Cluster =
152 Fis
153 { _fis_support :: Int
154 , _fis_size :: Int }
155 | MaxClique
156 { _mcl_size :: Int
157 , _mcl_threshold :: Double
158 , _mcl_filter :: MaxCliqueFilter }
159 deriving (Show,Generic,Eq)
160
161 instance ToSchema Cluster where
162 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
163
164
165 data Quality =
166 Quality { _qua_granularity :: Double
167 , _qua_minBranch :: Int }
168 deriving (Show,Generic,Eq)
169
170 instance ToSchema Quality where
171 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
172
173
174 data PhyloConfig =
175 PhyloConfig { corpusPath :: FilePath
176 , listPath :: FilePath
177 , outputPath :: FilePath
178 , corpusParser :: CorpusParser
179 , listParser :: ListParser
180 , phyloName :: Text
181 , phyloScale :: Int
182 , similarity :: PhyloSimilarity
183 , seaElevation :: SeaElevation
184 , defaultMode :: Bool
185 , findAncestors :: Bool
186 , phyloSynchrony :: Synchrony
187 , phyloQuality :: Quality
188 , timeUnit :: TimeUnit
189 , clique :: Cluster
190 , exportLabel :: [PhyloLabel]
191 , exportSort :: Sort
192 , exportFilter :: [Filter]
193 } deriving (Show,Generic,Eq)
194
195
196 ------------------------------------------------------------------------
197 data PhyloSubConfig =
198 PhyloSubConfig { _sc_phyloProximity :: Double
199 , _sc_phyloSynchrony :: Double
200 , _sc_phyloQuality :: Double
201 , _sc_timeUnit :: TimeUnit
202 , _sc_clique :: Cluster
203 , _sc_exportFilter :: Double
204 }
205 deriving (Show,Generic,Eq)
206
207
208 subConfig2config :: PhyloSubConfig -> PhyloConfig
209 subConfig2config subConfig = defaultConfig { similarity = WeightedLogJaccard (_sc_phyloProximity subConfig) 1
210 , phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
211 , phyloQuality = Quality (_sc_phyloQuality subConfig) 1
212 , timeUnit = _sc_timeUnit subConfig
213 , clique = _sc_clique subConfig
214 , exportFilter = [ByBranchSize $ _sc_exportFilter subConfig]
215 }
216
217 ------------------------------------------------------------------------
218 defaultConfig :: PhyloConfig
219 defaultConfig =
220 PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
221 , listPath = "list.csv" -- useful for commandline only
222 , outputPath = "data/"
223 , corpusParser = Csv 100000
224 , listParser = V4
225 , phyloName = pack "Phylo Name"
226 , phyloScale = 2
227 , similarity = WeightedLogJaccard 0.5 1
228 , seaElevation = Constante 0.1 0.1
229 , defaultMode = True
230 , findAncestors = False
231 , phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
232 , phyloQuality = Quality 0.5 1
233 , timeUnit = Year 3 1 5
234 , clique = MaxClique 5 0.0001 ByThreshold
235 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
236 , exportSort = ByHierarchy Desc
237 , exportFilter = [ByBranchSize 3]
238 }
239
240 -- Main Instances
241 instance ToSchema PhyloConfig
242 instance ToSchema PhyloSubConfig
243
244 instance FromJSON PhyloConfig
245 instance ToJSON PhyloConfig
246
247 instance FromJSON PhyloSubConfig
248 instance ToJSON PhyloSubConfig
249
250 instance FromJSON CorpusParser
251 instance ToJSON CorpusParser
252
253 instance FromJSON ListParser
254 instance ToJSON ListParser
255
256 instance FromJSON PhyloSimilarity
257 instance ToJSON PhyloSimilarity
258
259 instance FromJSON SeaElevation
260 instance ToJSON SeaElevation
261
262 instance FromJSON TimeUnit
263 instance ToJSON TimeUnit
264
265 instance FromJSON MaxCliqueFilter
266 instance ToJSON MaxCliqueFilter
267
268 instance FromJSON Cluster
269 instance ToJSON Cluster
270
271 instance FromJSON PhyloLabel
272 instance ToJSON PhyloLabel
273
274 instance FromJSON Tagger
275 instance ToJSON Tagger
276
277 instance FromJSON Sort
278 instance ToJSON Sort
279
280 instance FromJSON Order
281 instance ToJSON Order
282
283 instance FromJSON Filter
284 instance ToJSON Filter
285
286 instance FromJSON SynchronyScope
287 instance ToJSON SynchronyScope
288
289 instance FromJSON SynchronyStrategy
290 instance ToJSON SynchronyStrategy
291
292 instance FromJSON Synchrony
293 instance ToJSON Synchrony
294
295 instance FromJSON Quality
296 instance ToJSON Quality
297
298
299 -- | Software parameters
300 data Software =
301 Software { _software_name :: Text
302 , _software_version :: Text
303 } deriving (Generic, Show, Eq)
304
305 instance ToSchema Software where
306 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
307
308
309
310 defaultSoftware :: Software
311 defaultSoftware =
312 Software { _software_name = pack "GarganText"
313 , _software_version = pack "v5" }
314
315
316 -- | Global parameters of a Phylo
317 data PhyloParam =
318 PhyloParam { _phyloParam_version :: Text
319 , _phyloParam_software :: Software
320 , _phyloParam_config :: PhyloConfig
321 } deriving (Generic, Show, Eq)
322
323 instance ToSchema PhyloParam where
324 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
325
326
327
328 defaultPhyloParam :: PhyloParam
329 defaultPhyloParam =
330 PhyloParam { _phyloParam_version = pack "v3"
331 , _phyloParam_software = defaultSoftware
332 , _phyloParam_config = defaultConfig }
333
334
335 ------------------
336 -- | Document | --
337 ------------------
338
339 -- | Date : a simple Integer
340 type Date = Int
341
342 -- | DateStr : the string version of a Date
343 type DateStr = Text
344
345 -- | Ngrams : a contiguous sequence of n terms
346 type Ngrams = Text
347
348 -- Document : a piece of Text linked to a Date
349 -- date = computational date; date' = original string date yyyy-mm-dd
350 -- Export Database to Document
351 data Document = Document
352 { date :: Date -- datatype Date {unDate :: Int}
353 , date' :: DateStr -- show date
354 , text :: [Ngrams]
355 , weight :: Maybe Double
356 , sources :: [Text]
357 } deriving (Eq,Show,Generic,NFData)
358
359
360 --------------------
361 -- | Foundation | --
362 --------------------
363
364
365 -- | The Foundations of a Phylo created from a given TermList
366 data PhyloFoundations = PhyloFoundations
367 { _foundations_roots :: (Vector Ngrams)
368 , _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
369 } deriving (Generic, Show, Eq)
370
371 data PhyloCounts = PhyloCounts
372 { coocByDate :: !(Map Date Cooc)
373 , docsByDate :: !(Map Date Double)
374 , rootsCount :: !(Map Int Double)
375 , rootsFreq :: !(Map Int Double)
376 , lastRootsFreq :: !(Map Int Double)
377 } deriving (Generic, Show, Eq)
378
379 data PhyloSources = PhyloSources
380 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
381
382 instance ToSchema PhyloFoundations where
383 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
384 instance ToSchema PhyloCounts where
385 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
386 instance ToSchema PhyloSources where
387 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
388
389 ---------------------------
390 -- | Coocurency Matrix | --
391 ---------------------------
392
393
394 -- | Cooc : a coocurency matrix between two ngrams
395 type Cooc = Map (Int,Int) Double
396
397
398 -------------------
399 -- | Phylomemy | --
400 -------------------
401
402 -- | Period : a tuple of Dates
403 type Period = (Date,Date)
404
405 -- | PeriodStr : a tuple of DateStr
406 type PeriodStr = (DateStr,DateStr)
407
408
409
410
411 -- | Phylo datatype of a phylomemy
412 -- foundations : the foundations of the phylo
413 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
414 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
415 -- param : the parameters of the phylomemy (with the user's configuration)
416 -- periods : the temporal steps of a phylomemy
417 data Phylo =
418 Phylo { _phylo_foundations :: PhyloFoundations
419 , _phylo_sources :: PhyloSources
420 , _phylo_counts :: PhyloCounts
421 , _phylo_seaLadder :: [Double]
422 , _phylo_param :: PhyloParam
423 , _phylo_periods :: Map Period PhyloPeriod
424 , _phylo_quality :: Double
425 , _phylo_level :: Double
426 }
427 deriving (Generic, Show, Eq)
428
429 instance ToSchema Phylo where
430 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
431
432
433 ----------------
434 -- | Period | --
435 ----------------
436
437 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
438 -- id: tuple (start date, end date) of the temporal step of the phylomemy
439 -- scales: scales of synchronic description
440 data PhyloPeriod =
441 PhyloPeriod { _phylo_periodPeriod :: Period
442 , _phylo_periodPeriodStr :: PeriodStr
443 , _phylo_periodScales :: Map PhyloScaleId PhyloScale
444 } deriving (Generic, Show, Eq)
445
446 instance ToSchema PhyloPeriod where
447 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
448
449 ---------------
450 -- | Scale | --
451 ---------------
452
453 -- | Scale : a scale of synchronic description
454 type Scale = Int
455
456 -- | PhyloScaleId : the id of a scale of synchronic description
457 type PhyloScaleId = (Period,Scale)
458
459 -- | PhyloScale : sub-structure of the phylomemy in scale of synchronic description
460 data PhyloScale =
461 PhyloScale { _phylo_scalePeriod :: Period
462 , _phylo_scalePeriodStr :: PeriodStr
463 , _phylo_scaleScale :: Scale
464 , _phylo_scaleGroups :: Map PhyloGroupId PhyloGroup
465 }
466 deriving (Generic, Show, Eq)
467
468 instance ToSchema PhyloScale where
469 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
470
471
472 type PhyloGroupId = (PhyloScaleId, Int)
473
474 -- | BranchId : (a scale, a sequence of branch index)
475 -- the sequence is a path of heritage from the most to the less specific branch
476 type PhyloBranchId = (Scale, [Int])
477
478 -- | PhyloGroup : group of ngrams at each scale and period
479 data PhyloGroup =
480 PhyloGroup { _phylo_groupPeriod :: Period
481 , _phylo_groupPeriod' :: (Text,Text)
482 , _phylo_groupScale :: Scale
483 , _phylo_groupIndex :: Int
484 , _phylo_groupLabel :: Text
485 , _phylo_groupSupport :: Support
486 , _phylo_groupWeight :: Maybe Double
487 , _phylo_groupSources :: [Int]
488 , _phylo_groupNgrams :: [Int]
489 , _phylo_groupCooc :: !(Cooc)
490 , _phylo_groupBranchId :: PhyloBranchId
491 , _phylo_groupMeta :: Map Text [Double]
492 , _phylo_groupScaleParents :: [Pointer]
493 , _phylo_groupScaleChilds :: [Pointer]
494 , _phylo_groupPeriodParents :: [Pointer]
495 , _phylo_groupPeriodChilds :: [Pointer]
496 , _phylo_groupAncestors :: [Pointer]
497 , _phylo_groupPeriodMemoryParents :: [Pointer']
498 , _phylo_groupPeriodMemoryChilds :: [Pointer']
499 }
500 deriving (Generic, Show, Eq, NFData)
501
502 instance ToSchema PhyloGroup where
503 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
504
505
506 -- | Weight : A generic mesure that can be associated with an Id
507 type Weight = Double
508 type Thr = Double
509
510 -- | Pointer : A weighted pointer to a given PhyloGroup
511 type Pointer = (PhyloGroupId, Weight)
512 -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
513 type Pointer' = (PhyloGroupId, (Thr,Weight))
514
515 data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
516 data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
517
518
519 --------------------------
520 -- | Phylo Clustering | --
521 --------------------------
522
523 -- | Support : Number of Documents where a Cluster occurs
524 type Support = Int
525
526 data Clustering = Clustering
527 { _clustering_roots :: [Int]
528 , _clustering_support :: Support
529 , _clustering_period :: Period
530 -- additional materials for visualization
531 , _clustering_visWeighting :: Maybe Double
532 , _clustering_visFiltering :: [Int]
533 } deriving (Generic,NFData,Show,Eq)
534
535 ----------------
536 -- | Export | --
537 ----------------
538
539 type DotId = TextLazy.Text
540
541 data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
542
543 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
544 instance ToSchema Filter where
545 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
546
547
548 data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
549
550 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
551 instance ToSchema Sort where
552 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
553
554
555 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
556 instance ToSchema Tagger where
557 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
558
559
560 data PhyloLabel =
561 BranchLabel
562 { _branch_labelTagger :: Tagger
563 , _branch_labelSize :: Int }
564 | GroupLabel
565 { _group_labelTagger :: Tagger
566 , _group_labelSize :: Int }
567 deriving (Show,Generic,Eq)
568
569 instance ToSchema PhyloLabel where
570 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
571
572
573 data PhyloBranch =
574 PhyloBranch
575 { _branch_id :: PhyloBranchId
576 , _branch_canonId :: [Int]
577 , _branch_seaLevel :: [Double]
578 , _branch_x :: Double
579 , _branch_y :: Double
580 , _branch_w :: Double
581 , _branch_t :: Double
582 , _branch_label :: Text
583 , _branch_meta :: Map Text [Double]
584 } deriving (Generic, Show, Eq)
585
586 instance ToSchema PhyloBranch where
587 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
588
589 data PhyloExport =
590 PhyloExport
591 { _export_groups :: [PhyloGroup]
592 , _export_branches :: [PhyloBranch]
593 } deriving (Generic, Show)
594 instance ToSchema PhyloExport where
595 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
596
597
598 ----------------
599 -- | Lenses | --
600 ----------------
601
602 makeLenses ''PhyloConfig
603 makeLenses ''PhyloSubConfig
604 makeLenses ''PhyloSimilarity
605 makeLenses ''SeaElevation
606 makeLenses ''Quality
607 makeLenses ''Cluster
608 makeLenses ''PhyloLabel
609 makeLenses ''TimeUnit
610 makeLenses ''PhyloFoundations
611 makeLenses ''Clustering
612 makeLenses ''Phylo
613 makeLenses ''PhyloPeriod
614 makeLenses ''PhyloScale
615 makeLenses ''PhyloGroup
616 makeLenses ''PhyloParam
617 makeLenses ''PhyloExport
618 makeLenses ''PhyloBranch
619
620 ------------------------
621 -- | JSON instances | --
622 ------------------------
623
624 instance FromJSON Phylo
625 instance ToJSON Phylo
626
627 instance FromJSON PhyloSources
628 instance ToJSON PhyloSources
629
630 instance FromJSON PhyloParam
631 instance ToJSON PhyloParam
632
633 instance FromJSON PhyloCounts
634 instance ToJSON PhyloCounts
635
636 instance FromJSON PhyloPeriod
637 instance ToJSON PhyloPeriod
638
639 instance FromJSON PhyloScale
640 instance ToJSON PhyloScale
641
642 instance FromJSON Software
643 instance ToJSON Software
644
645 instance FromJSON PhyloGroup
646 instance ToJSON PhyloGroup
647
648 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)