]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo.hs
[FEAT][PHYLO] preparing integration to backend
[gargantext.git] / src / Gargantext / Core / Viz / Phylo.hs
1 {-|
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.Phylo where
28
29 import Control.DeepSeq (NFData)
30 import Control.Lens (makeLenses)
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Map (Map)
34 import Data.Text (Text, pack)
35 import Data.Vector (Vector)
36 import GHC.Generics
37 import GHC.IO (FilePath)
38 import Gargantext.Core.Text.Context (TermList)
39 import Gargantext.Core.Utils.Prefix (unPrefix)
40 import Gargantext.Prelude
41 import qualified Data.Text.Lazy as TextLazy
42
43 ----------------
44 -- | Config | --
45 ----------------
46
47 data CorpusParser =
48 Wos {_wos_limit :: Int}
49 | Csv {_csv_limit :: Int}
50 | Csv' {_csv'_limit :: Int}
51 deriving (Show,Generic,Eq)
52
53 data SeaElevation =
54 Constante
55 { _cons_start :: Double
56 , _cons_step :: Double }
57 | Adaptative
58 { _adap_granularity :: Double }
59 deriving (Show,Generic,Eq)
60
61 data Proximity =
62 WeightedLogJaccard
63 { _wlj_sensibility :: Double
64 {-
65 -- , _wlj_thresholdInit :: Double
66 -- , _wlj_thresholdStep :: Double
67 -- | max height for sea level in temporal matching
68 -- , _wlj_elevation :: Double
69 -}
70 }
71 | WeightedLogSim
72 { _wlj_sensibility :: Double
73 {-
74 -- , _wlj_thresholdInit :: Double
75 -- , _wlj_thresholdStep :: Double
76 -- | max height for sea level in temporal matching
77 -- , _wlj_elevation :: Double
78 -}
79 }
80 | Hamming
81 deriving (Show,Generic,Eq)
82
83
84 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
85
86 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
87
88 data Synchrony =
89 ByProximityThreshold
90 { _bpt_threshold :: Double
91 , _bpt_sensibility :: Double
92 , _bpt_scope :: SynchronyScope
93 , _bpt_strategy :: SynchronyStrategy }
94 | ByProximityDistribution
95 { _bpd_sensibility :: Double
96 , _bpd_strategy :: SynchronyStrategy }
97 deriving (Show,Generic,Eq)
98
99
100 data TimeUnit =
101 Year
102 { _year_period :: Int
103 , _year_step :: Int
104 , _year_matchingFrame :: Int }
105 | Month
106 { _month_period :: Int
107 , _month_step :: Int
108 , _month_matchingFrame :: Int }
109 | Week
110 { _week_period :: Int
111 , _week_step :: Int
112 , _week_matchingFrame :: Int }
113 | Day
114 { _day_period :: Int
115 , _day_step :: Int
116 , _day_matchingFrame :: Int }
117 deriving (Show,Generic,Eq)
118
119 data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
120
121 data Clique =
122 Fis
123 { _fis_support :: Int
124 , _fis_size :: Int }
125 | MaxClique
126 { _mcl_size :: Int
127 , _mcl_threshold :: Double
128 , _mcl_filter :: CliqueFilter }
129 deriving (Show,Generic,Eq)
130
131
132 data Quality =
133 Quality { _qua_granularity :: Double
134 , _qua_minBranch :: Int }
135 deriving (Show,Generic,Eq)
136
137
138 data Config =
139 Config { corpusPath :: FilePath
140 , listPath :: FilePath
141 , outputPath :: FilePath
142 , corpusParser :: CorpusParser
143 , phyloName :: Text
144 , phyloLevel :: Int
145 , phyloProximity :: Proximity
146 , seaElevation :: SeaElevation
147 , findAncestors :: Bool
148 , phyloSynchrony :: Synchrony
149 , phyloQuality :: Quality
150 , timeUnit :: TimeUnit
151 , clique :: Clique
152 , exportLabel :: [PhyloLabel]
153 , exportSort :: Sort
154 , exportFilter :: [Filter]
155 } deriving (Show,Generic,Eq)
156
157
158 defaultConfig :: Config
159 defaultConfig =
160 Config { corpusPath = ""
161 , listPath = ""
162 , outputPath = ""
163 , corpusParser = Csv 1000
164 , phyloName = pack "Default Phylo"
165 , phyloLevel = 2
166 , phyloProximity = WeightedLogJaccard 10
167 , seaElevation = Constante 0.1 0.1
168 , findAncestors = True
169 , phyloSynchrony = ByProximityThreshold 0.1 10 SiblingBranches MergeAllGroups
170 , phyloQuality = Quality 0 1
171 , timeUnit = Year 3 1 5
172 , clique = MaxClique 0 3 ByNeighbours
173 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
174 , exportSort = ByHierarchy
175 , exportFilter = [ByBranchSize 2]
176 }
177
178 instance FromJSON Config
179 instance ToJSON Config
180
181 instance FromJSON CorpusParser
182 instance ToJSON CorpusParser
183
184 instance FromJSON Proximity
185 instance ToJSON Proximity
186
187 instance FromJSON SeaElevation
188 instance ToJSON SeaElevation
189
190 instance FromJSON TimeUnit
191 instance ToJSON TimeUnit
192
193 instance FromJSON CliqueFilter
194 instance ToJSON CliqueFilter
195
196 instance FromJSON Clique
197 instance ToJSON Clique
198
199 instance FromJSON PhyloLabel
200 instance ToJSON PhyloLabel
201
202 instance FromJSON Tagger
203 instance ToJSON Tagger
204
205 instance FromJSON Sort
206 instance ToJSON Sort
207
208 instance FromJSON Order
209 instance ToJSON Order
210
211 instance FromJSON Filter
212 instance ToJSON Filter
213
214 instance FromJSON SynchronyScope
215 instance ToJSON SynchronyScope
216
217 instance FromJSON SynchronyStrategy
218 instance ToJSON SynchronyStrategy
219
220 instance FromJSON Synchrony
221 instance ToJSON Synchrony
222
223 instance FromJSON Quality
224 instance ToJSON Quality
225
226
227 -- | Software parameters
228 data Software =
229 Software { _software_name :: Text
230 , _software_version :: Text
231 } deriving (Generic, Show, Eq)
232
233 defaultSoftware :: Software
234 defaultSoftware =
235 Software { _software_name = pack "Gargantext"
236 , _software_version = pack "v4" }
237
238
239 -- | Global parameters of a Phylo
240 data PhyloParam =
241 PhyloParam { _phyloParam_version :: Text
242 , _phyloParam_software :: Software
243 , _phyloParam_config :: Config
244 } deriving (Generic, Show, Eq)
245
246 defaultPhyloParam :: PhyloParam
247 defaultPhyloParam =
248 PhyloParam { _phyloParam_version = pack "v2.adaptative"
249 , _phyloParam_software = defaultSoftware
250 , _phyloParam_config = defaultConfig }
251
252
253 ------------------
254 -- | Document | --
255 ------------------
256
257 -- | Date : a simple Integer
258 type Date = Int
259
260 -- | Ngrams : a contiguous sequence of n terms
261 type Ngrams = Text
262
263 -- Document : a piece of Text linked to a Date
264 -- date = computational date; date' = original string date yyyy-mm-dd
265 -- Export Database to Document
266 data Document = Document
267 { date :: Date -- datatype Date {unDate :: Int}
268 , date' :: Text -- show date
269 , text :: [Ngrams]
270 , weight :: Maybe Double
271 , sources :: [Text]
272 } deriving (Eq,Show,Generic,NFData)
273
274
275 --------------------
276 -- | Foundation | --
277 --------------------
278
279
280 -- | The Foundations of a Phylo created from a given TermList
281 data PhyloFoundations = PhyloFoundations
282 { _foundations_roots :: !(Vector Ngrams)
283 , _foundations_mapList :: TermList
284 } deriving (Generic, Show, Eq)
285
286
287 data PhyloSources = PhyloSources
288 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
289
290
291 ---------------------------
292 -- | Coocurency Matrix | --
293 ---------------------------
294
295
296 -- | Cooc : a coocurency matrix between two ngrams
297 type Cooc = Map (Int,Int) Double
298
299
300 -------------------
301 -- | Phylomemy | --
302 -------------------
303
304
305 -- | Phylo datatype of a phylomemy
306 -- foundations : the foundations of the phylo
307 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
308 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
309 -- param : the parameters of the phylomemy (with the user's configuration)
310 -- periods : the temporal steps of a phylomemy
311 data Phylo =
312 Phylo { _phylo_foundations :: PhyloFoundations
313 , _phylo_sources :: PhyloSources
314 , _phylo_timeCooc :: !(Map Date Cooc)
315 , _phylo_timeDocs :: !(Map Date Double)
316 , _phylo_termFreq :: !(Map Int Double)
317 , _phylo_lastTermFreq :: !(Map Int Double)
318 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
319 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
320 , _phylo_param :: PhyloParam
321 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
322 }
323 deriving (Generic, Show, Eq)
324
325
326 -- | PhyloPeriodId : the id of a given period
327 type PhyloPeriodId = (Date,Date)
328
329 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
330 -- id: tuple (start date, end date) of the temporal step of the phylomemy
331 -- levels: levels of granularity
332 data PhyloPeriod =
333 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
334 , _phylo_periodPeriod' :: (Text,Text)
335 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
336 } deriving (Generic, Show, Eq)
337
338
339 -- | Level : a level of clustering
340 type Level = Int
341
342 -- | PhyloLevelId : the id of a level of clustering in a given period
343 type PhyloLevelId = (PhyloPeriodId,Level)
344
345 -- | PhyloLevel : levels of phylomemy on a synchronic axis
346 -- Levels description:
347 -- Level 0: The foundations and the base of the phylo
348 -- Level 1: First level of clustering (the Fis)
349 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
350 data PhyloLevel =
351 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
352 , _phylo_levelPeriod' :: (Text,Text)
353 , _phylo_levelLevel :: Level
354 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
355 }
356 deriving (Generic, Show, Eq)
357
358
359 type PhyloGroupId = (PhyloLevelId, Int)
360
361 -- | BranchId : (a level, a sequence of branch index)
362 -- the sequence is a path of heritage from the most to the less specific branch
363 type PhyloBranchId = (Level, [Int])
364
365 -- | PhyloGroup : group of ngrams at each level and period
366 data PhyloGroup =
367 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
368 , _phylo_groupPeriod' :: (Text,Text)
369 , _phylo_groupLevel :: Level
370 , _phylo_groupIndex :: Int
371 , _phylo_groupLabel :: Text
372 , _phylo_groupSupport :: Support
373 , _phylo_groupWeight :: Maybe Double
374 , _phylo_groupSources :: [Int]
375 , _phylo_groupNgrams :: [Int]
376 , _phylo_groupCooc :: !(Cooc)
377 , _phylo_groupBranchId :: PhyloBranchId
378 , _phylo_groupMeta :: Map Text [Double]
379 , _phylo_groupLevelParents :: [Pointer]
380 , _phylo_groupLevelChilds :: [Pointer]
381 , _phylo_groupPeriodParents :: [Pointer]
382 , _phylo_groupPeriodChilds :: [Pointer]
383 , _phylo_groupAncestors :: [Pointer]
384 }
385 deriving (Generic, Show, Eq, NFData)
386
387 -- | Weight : A generic mesure that can be associated with an Id
388 type Weight = Double
389
390 -- | Pointer : A weighted pointer to a given PhyloGroup
391 type Pointer = (PhyloGroupId, Weight)
392
393 data Filiation = ToParents | ToChilds deriving (Generic, Show)
394 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
395
396
397 ----------------------
398 -- | Phylo Clique | --
399 ----------------------
400
401 -- | Support : Number of Documents where a Clique occurs
402 type Support = Int
403
404 data PhyloClique = PhyloClique
405 { _phyloClique_nodes :: [Int]
406 , _phyloClique_support :: Support
407 , _phyloClique_period :: (Date,Date)
408 , _phyloClique_weight :: Maybe Double
409 , _phyloClique_sources :: [Int]
410 } deriving (Generic,NFData,Show,Eq)
411
412 ----------------
413 -- | Export | --
414 ----------------
415
416 type DotId = TextLazy.Text
417
418 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
419
420 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
421
422 data Order = Asc | Desc deriving (Show,Generic,Eq)
423
424 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
425
426 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
427
428 data PhyloLabel =
429 BranchLabel
430 { _branch_labelTagger :: Tagger
431 , _branch_labelSize :: Int }
432 | GroupLabel
433 { _group_labelTagger :: Tagger
434 , _group_labelSize :: Int }
435 deriving (Show,Generic,Eq)
436
437 data PhyloBranch =
438 PhyloBranch
439 { _branch_id :: PhyloBranchId
440 , _branch_canonId :: [Int]
441 , _branch_seaLevel :: [Double]
442 , _branch_x :: Double
443 , _branch_y :: Double
444 , _branch_w :: Double
445 , _branch_t :: Double
446 , _branch_label :: Text
447 , _branch_meta :: Map Text [Double]
448 } deriving (Generic, Show, Eq)
449
450 data PhyloExport =
451 PhyloExport
452 { _export_groups :: [PhyloGroup]
453 , _export_branches :: [PhyloBranch]
454 } deriving (Generic, Show)
455
456 ----------------
457 -- | Lenses | --
458 ----------------
459
460 makeLenses ''Config
461 makeLenses ''Proximity
462 makeLenses ''SeaElevation
463 makeLenses ''Quality
464 makeLenses ''Clique
465 makeLenses ''PhyloLabel
466 makeLenses ''TimeUnit
467 makeLenses ''PhyloFoundations
468 makeLenses ''PhyloClique
469 makeLenses ''Phylo
470 makeLenses ''PhyloPeriod
471 makeLenses ''PhyloLevel
472 makeLenses ''PhyloGroup
473 makeLenses ''PhyloParam
474 makeLenses ''PhyloExport
475 makeLenses ''PhyloBranch
476
477 ------------------------
478 -- | JSON instances | --
479 ------------------------
480
481 instance FromJSON Phylo
482 instance ToJSON Phylo
483
484 instance FromJSON PhyloSources
485 instance ToJSON PhyloSources
486
487 instance FromJSON PhyloParam
488 instance ToJSON PhyloParam
489
490 instance FromJSON PhyloPeriod
491 instance ToJSON PhyloPeriod
492
493 instance FromJSON PhyloLevel
494 instance ToJSON PhyloLevel
495
496 instance FromJSON Software
497 instance ToJSON Software
498
499 instance FromJSON PhyloGroup
500 instance ToJSON PhyloGroup
501
502 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)