]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/AdaptativePhylo.hs
Merge branch 'dev' into 78-dev-list-ids
[gargantext.git] / src / Gargantext / Core / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.AdaptativePhylo where
28
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Text (Text, pack)
32 import Data.Vector (Vector)
33 import Data.Map (Map)
34
35 import Gargantext.Core.Utils.Prefix (unPrefix)
36 import Gargantext.Prelude
37 import Gargantext.Core.Text.Context (TermList)
38
39 import GHC.Generics
40 import GHC.IO (FilePath)
41 import Control.DeepSeq (NFData)
42 import Control.Lens (makeLenses)
43
44 import qualified Data.Text.Lazy as TextLazy
45
46
47 ----------------
48 -- | Config | --
49 ----------------
50
51
52 data CorpusParser =
53 Wos {_wos_limit :: Int}
54 | Csv {_csv_limit :: Int}
55 | Csv' {_csv'_limit :: Int}
56 deriving (Show,Generic,Eq)
57
58 data SeaElevation =
59 Constante
60 { _cons_start :: Double
61 , _cons_step :: Double }
62 | Adaptative
63 { _adap_granularity :: Double }
64 deriving (Show,Generic,Eq)
65
66 data Proximity =
67 WeightedLogJaccard
68 { _wlj_sensibility :: Double
69 {-
70 -- , _wlj_thresholdInit :: Double
71 -- , _wlj_thresholdStep :: Double
72 -- | max height for sea level in temporal matching
73 -- , _wlj_elevation :: Double
74 -}
75 }
76 | WeightedLogSim
77 { _wlj_sensibility :: Double
78 {-
79 -- , _wlj_thresholdInit :: Double
80 -- , _wlj_thresholdStep :: Double
81 -- | max height for sea level in temporal matching
82 -- , _wlj_elevation :: Double
83 -}
84 }
85 | Hamming
86 deriving (Show,Generic,Eq)
87
88
89 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
90
91 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
92
93 data Synchrony =
94 ByProximityThreshold
95 { _bpt_threshold :: Double
96 , _bpt_sensibility :: Double
97 , _bpt_scope :: SynchronyScope
98 , _bpt_strategy :: SynchronyStrategy }
99 | ByProximityDistribution
100 { _bpd_sensibility :: Double
101 , _bpd_strategy :: SynchronyStrategy }
102 deriving (Show,Generic,Eq)
103
104
105 data TimeUnit =
106 Year
107 { _year_period :: Int
108 , _year_step :: Int
109 , _year_matchingFrame :: Int }
110 | Month
111 { _month_period :: Int
112 , _month_step :: Int
113 , _month_matchingFrame :: Int }
114 | Week
115 { _week_period :: Int
116 , _week_step :: Int
117 , _week_matchingFrame :: Int }
118 | Day
119 { _day_period :: Int
120 , _day_step :: Int
121 , _day_matchingFrame :: Int }
122 deriving (Show,Generic,Eq)
123
124 data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
125
126 data Clique =
127 Fis
128 { _fis_support :: Int
129 , _fis_size :: Int }
130 | MaxClique
131 { _mcl_size :: Int
132 , _mcl_threshold :: Double
133 , _mcl_filter :: CliqueFilter }
134 deriving (Show,Generic,Eq)
135
136
137 data Quality =
138 Quality { _qua_granularity :: Double
139 , _qua_minBranch :: Int }
140 deriving (Show,Generic,Eq)
141
142
143 data Config =
144 Config { corpusPath :: FilePath
145 , listPath :: FilePath
146 , outputPath :: FilePath
147 , corpusParser :: CorpusParser
148 , phyloName :: Text
149 , phyloLevel :: Int
150 , phyloProximity :: Proximity
151 , seaElevation :: SeaElevation
152 , findAncestors :: Bool
153 , phyloSynchrony :: Synchrony
154 , phyloQuality :: Quality
155 , timeUnit :: TimeUnit
156 , clique :: Clique
157 , exportLabel :: [PhyloLabel]
158 , exportSort :: Sort
159 , exportFilter :: [Filter]
160 } deriving (Show,Generic,Eq)
161
162
163 defaultConfig :: Config
164 defaultConfig =
165 Config { corpusPath = ""
166 , listPath = ""
167 , outputPath = ""
168 , corpusParser = Csv 1000
169 , phyloName = pack "Default Phylo"
170 , phyloLevel = 2
171 , phyloProximity = WeightedLogJaccard 10
172 , seaElevation = Constante 0.1 0.1
173 , findAncestors = True
174 , phyloSynchrony = ByProximityThreshold 0.1 10 SiblingBranches MergeAllGroups
175 , phyloQuality = Quality 0 1
176 , timeUnit = Year 3 1 5
177 , clique = MaxClique 0 3 ByNeighbours
178 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
179 , exportSort = ByHierarchy
180 , exportFilter = [ByBranchSize 2]
181 }
182
183 instance FromJSON Config
184 instance ToJSON Config
185 instance FromJSON CorpusParser
186 instance ToJSON CorpusParser
187 instance FromJSON Proximity
188 instance ToJSON Proximity
189 instance FromJSON SeaElevation
190 instance ToJSON SeaElevation
191 instance FromJSON TimeUnit
192 instance ToJSON TimeUnit
193 instance FromJSON CliqueFilter
194 instance ToJSON CliqueFilter
195 instance FromJSON Clique
196 instance ToJSON Clique
197 instance FromJSON PhyloLabel
198 instance ToJSON PhyloLabel
199 instance FromJSON Tagger
200 instance ToJSON Tagger
201 instance FromJSON Sort
202 instance ToJSON Sort
203 instance FromJSON Order
204 instance ToJSON Order
205 instance FromJSON Filter
206 instance ToJSON Filter
207 instance FromJSON SynchronyScope
208 instance ToJSON SynchronyScope
209 instance FromJSON SynchronyStrategy
210 instance ToJSON SynchronyStrategy
211 instance FromJSON Synchrony
212 instance ToJSON Synchrony
213 instance FromJSON Quality
214 instance ToJSON Quality
215
216
217 -- | Software parameters
218 data Software =
219 Software { _software_name :: Text
220 , _software_version :: Text
221 } deriving (Generic, Show, Eq)
222
223 defaultSoftware :: Software
224 defaultSoftware =
225 Software { _software_name = pack "Gargantext"
226 , _software_version = pack "v4" }
227
228
229 -- | Global parameters of a Phylo
230 data PhyloParam =
231 PhyloParam { _phyloParam_version :: Text
232 , _phyloParam_software :: Software
233 , _phyloParam_config :: Config
234 } deriving (Generic, Show, Eq)
235
236 defaultPhyloParam :: PhyloParam
237 defaultPhyloParam =
238 PhyloParam { _phyloParam_version = pack "v2.adaptative"
239 , _phyloParam_software = defaultSoftware
240 , _phyloParam_config = defaultConfig }
241
242
243 ------------------
244 -- | Document | --
245 ------------------
246
247 -- | Date : a simple Integer
248 type Date = Int
249
250 -- | Ngrams : a contiguous sequence of n terms
251 type Ngrams = Text
252
253 -- Document : a piece of Text linked to a Date
254 -- date = computational date; date' = original string date yyyy-mm-dd
255 data Document = Document
256 { date :: Date
257 , date' :: Text
258 , text :: [Ngrams]
259 , weight :: Maybe Double
260 , sources :: [Text]
261 } deriving (Eq,Show,Generic,NFData)
262
263
264 --------------------
265 -- | Foundation | --
266 --------------------
267
268
269 -- | The Foundations of a Phylo created from a given TermList
270 data PhyloFoundations = PhyloFoundations
271 { _foundations_roots :: !(Vector Ngrams)
272 , _foundations_mapList :: TermList
273 } deriving (Generic, Show, Eq)
274
275
276 data PhyloSources = PhyloSources
277 { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
278
279
280 ---------------------------
281 -- | Coocurency Matrix | --
282 ---------------------------
283
284
285 -- | Cooc : a coocurency matrix between two ngrams
286 type Cooc = Map (Int,Int) Double
287
288
289 -------------------
290 -- | Phylomemy | --
291 -------------------
292
293
294 -- | Phylo datatype of a phylomemy
295 -- foundations : the foundations of the phylo
296 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
297 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
298 -- param : the parameters of the phylomemy (with the user's configuration)
299 -- periods : the temporal steps of a phylomemy
300 data Phylo =
301 Phylo { _phylo_foundations :: PhyloFoundations
302 , _phylo_sources :: PhyloSources
303 , _phylo_timeCooc :: !(Map Date Cooc)
304 , _phylo_timeDocs :: !(Map Date Double)
305 , _phylo_termFreq :: !(Map Int Double)
306 , _phylo_lastTermFreq :: !(Map Int Double)
307 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
308 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
309 , _phylo_param :: PhyloParam
310 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
311 }
312 deriving (Generic, Show, Eq)
313
314
315 -- | PhyloPeriodId : the id of a given period
316 type PhyloPeriodId = (Date,Date)
317
318 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
319 -- id: tuple (start date, end date) of the temporal step of the phylomemy
320 -- levels: levels of granularity
321 data PhyloPeriod =
322 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
323 , _phylo_periodPeriod' :: (Text,Text)
324 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
325 } deriving (Generic, Show, Eq)
326
327
328 -- | Level : a level of clustering
329 type Level = Int
330
331 -- | PhyloLevelId : the id of a level of clustering in a given period
332 type PhyloLevelId = (PhyloPeriodId,Level)
333
334 -- | PhyloLevel : levels of phylomemy on a synchronic axis
335 -- Levels description:
336 -- Level 0: The foundations and the base of the phylo
337 -- Level 1: First level of clustering (the Fis)
338 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
339 data PhyloLevel =
340 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
341 , _phylo_levelPeriod' :: (Text,Text)
342 , _phylo_levelLevel :: Level
343 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
344 }
345 deriving (Generic, Show, Eq)
346
347
348 type PhyloGroupId = (PhyloLevelId, Int)
349
350 -- | BranchId : (a level, a sequence of branch index)
351 -- the sequence is a path of heritage from the most to the less specific branch
352 type PhyloBranchId = (Level, [Int])
353
354 -- | PhyloGroup : group of ngrams at each level and period
355 data PhyloGroup =
356 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
357 , _phylo_groupPeriod' :: (Text,Text)
358 , _phylo_groupLevel :: Level
359 , _phylo_groupIndex :: Int
360 , _phylo_groupLabel :: Text
361 , _phylo_groupSupport :: Support
362 , _phylo_groupWeight :: Maybe Double
363 , _phylo_groupSources :: [Int]
364 , _phylo_groupNgrams :: [Int]
365 , _phylo_groupCooc :: !(Cooc)
366 , _phylo_groupBranchId :: PhyloBranchId
367 , _phylo_groupMeta :: Map Text [Double]
368 , _phylo_groupLevelParents :: [Pointer]
369 , _phylo_groupLevelChilds :: [Pointer]
370 , _phylo_groupPeriodParents :: [Pointer]
371 , _phylo_groupPeriodChilds :: [Pointer]
372 , _phylo_groupAncestors :: [Pointer]
373 }
374 deriving (Generic, Show, Eq, NFData)
375
376 -- | Weight : A generic mesure that can be associated with an Id
377 type Weight = Double
378
379 -- | Pointer : A weighted pointer to a given PhyloGroup
380 type Pointer = (PhyloGroupId, Weight)
381
382 data Filiation = ToParents | ToChilds deriving (Generic, Show)
383 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
384
385
386 ----------------------
387 -- | Phylo Clique | --
388 ----------------------
389
390 -- | Support : Number of Documents where a Clique occurs
391 type Support = Int
392
393 data PhyloClique = PhyloClique
394 { _phyloClique_nodes :: [Int]
395 , _phyloClique_support :: Support
396 , _phyloClique_period :: (Date,Date)
397 , _phyloClique_weight :: Maybe Double
398 , _phyloClique_sources :: [Int]
399 } deriving (Generic,NFData,Show,Eq)
400
401 ----------------
402 -- | Export | --
403 ----------------
404
405 type DotId = TextLazy.Text
406
407 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
408
409 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
410
411 data Order = Asc | Desc deriving (Show,Generic,Eq)
412
413 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
414
415 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
416
417 data PhyloLabel =
418 BranchLabel
419 { _branch_labelTagger :: Tagger
420 , _branch_labelSize :: Int }
421 | GroupLabel
422 { _group_labelTagger :: Tagger
423 , _group_labelSize :: Int }
424 deriving (Show,Generic,Eq)
425
426 data PhyloBranch =
427 PhyloBranch
428 { _branch_id :: PhyloBranchId
429 , _branch_canonId :: [Int]
430 , _branch_seaLevel :: [Double]
431 , _branch_x :: Double
432 , _branch_y :: Double
433 , _branch_w :: Double
434 , _branch_t :: Double
435 , _branch_label :: Text
436 , _branch_meta :: Map Text [Double]
437 } deriving (Generic, Show, Eq)
438
439 data PhyloExport =
440 PhyloExport
441 { _export_groups :: [PhyloGroup]
442 , _export_branches :: [PhyloBranch]
443 } deriving (Generic, Show)
444
445 ----------------
446 -- | Lenses | --
447 ----------------
448
449 makeLenses ''Config
450 makeLenses ''Proximity
451 makeLenses ''SeaElevation
452 makeLenses ''Quality
453 makeLenses ''Clique
454 makeLenses ''PhyloLabel
455 makeLenses ''TimeUnit
456 makeLenses ''PhyloFoundations
457 makeLenses ''PhyloClique
458 makeLenses ''Phylo
459 makeLenses ''PhyloPeriod
460 makeLenses ''PhyloLevel
461 makeLenses ''PhyloGroup
462 makeLenses ''PhyloParam
463 makeLenses ''PhyloExport
464 makeLenses ''PhyloBranch
465
466 ------------------------
467 -- | JSON instances | --
468 ------------------------
469
470 instance FromJSON Phylo
471 instance ToJSON Phylo
472 instance FromJSON PhyloSources
473 instance ToJSON PhyloSources
474 instance FromJSON PhyloParam
475 instance ToJSON PhyloParam
476 instance FromJSON PhyloPeriod
477 instance ToJSON PhyloPeriod
478 instance FromJSON PhyloLevel
479 instance ToJSON PhyloLevel
480 instance FromJSON Software
481 instance ToJSON Software
482 instance FromJSON PhyloGroup
483 instance ToJSON PhyloGroup
484
485 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)