]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/AdaptativePhylo.hs
[REFACT] Text -> Core
[gargantext.git] / src / Gargantext / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Viz.AdaptativePhylo where
28
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Text (Text, pack)
32 import Data.Vector (Vector)
33 import Data.Map (Map)
34
35 import Gargantext.Core.Utils.Prefix (unPrefix)
36 import Gargantext.Prelude
37 import Gargantext.Core.Text.Context (TermList)
38
39 import GHC.Generics
40 import GHC.IO (FilePath)
41 import Control.DeepSeq (NFData)
42 import Control.Lens (makeLenses)
43
44 import qualified Data.Text.Lazy as TextLazy
45
46
47 ----------------
48 -- | Config | --
49 ----------------
50
51
52 data CorpusParser =
53 Wos {_wos_limit :: Int}
54 | Csv {_csv_limit :: Int}
55 deriving (Show,Generic,Eq)
56
57 data SeaElevation =
58 Constante
59 { _cons_start :: Double
60 , _cons_step :: Double }
61 | Adaptative
62 { _adap_granularity :: Double }
63 deriving (Show,Generic,Eq)
64
65 data Proximity =
66 WeightedLogJaccard
67 { _wlj_sensibility :: Double
68 {-
69 -- , _wlj_thresholdInit :: Double
70 -- , _wlj_thresholdStep :: Double
71 -- | max height for sea level in temporal matching
72 -- , _wlj_elevation :: Double
73 -}
74 }
75 | Hamming
76 deriving (Show,Generic,Eq)
77
78
79 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
80
81 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
82
83 data Synchrony =
84 ByProximityThreshold
85 { _bpt_threshold :: Double
86 , _bpt_sensibility :: Double
87 , _bpt_scope :: SynchronyScope
88 , _bpt_strategy :: SynchronyStrategy }
89 | ByProximityDistribution
90 { _bpd_sensibility :: Double
91 , _bpd_strategy :: SynchronyStrategy }
92 deriving (Show,Generic,Eq)
93
94
95 data TimeUnit =
96 Year
97 { _year_period :: Int
98 , _year_step :: Int
99 , _year_matchingFrame :: Int }
100 deriving (Show,Generic,Eq)
101
102
103 data Clique =
104 Fis
105 { _fis_support :: Int
106 , _fis_size :: Int }
107 | MaxClique
108 { _mcl_size :: Int }
109 deriving (Show,Generic,Eq)
110
111
112 data Quality =
113 Quality { _qua_granularity :: Double
114 , _qua_minBranch :: Int }
115 deriving (Show,Generic,Eq)
116
117
118 data Config =
119 Config { corpusPath :: FilePath
120 , listPath :: FilePath
121 , outputPath :: FilePath
122 , corpusParser :: CorpusParser
123 , phyloName :: Text
124 , phyloLevel :: Int
125 , phyloProximity :: Proximity
126 , seaElevation :: SeaElevation
127 , phyloSynchrony :: Synchrony
128 , phyloQuality :: Quality
129 , timeUnit :: TimeUnit
130 , clique :: Clique
131 , exportLabel :: [PhyloLabel]
132 , exportSort :: Sort
133 , exportFilter :: [Filter]
134 } deriving (Show,Generic,Eq)
135
136
137 defaultConfig :: Config
138 defaultConfig =
139 Config { corpusPath = ""
140 , listPath = ""
141 , outputPath = ""
142 , corpusParser = Csv 1000
143 , phyloName = pack "Default Phylo"
144 , phyloLevel = 2
145 , phyloProximity = WeightedLogJaccard 10
146 , seaElevation = Constante 0.6 1
147 , phyloSynchrony = ByProximityThreshold 0.5 10 SiblingBranches MergeAllGroups
148 , phyloQuality = Quality 100 1
149 , timeUnit = Year 3 1 5
150 , clique = MaxClique 0
151 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
152 , exportSort = ByHierarchy
153 , exportFilter = [ByBranchSize 2]
154 }
155
156 instance FromJSON Config
157 instance ToJSON Config
158 instance FromJSON CorpusParser
159 instance ToJSON CorpusParser
160 instance FromJSON Proximity
161 instance ToJSON Proximity
162 instance FromJSON SeaElevation
163 instance ToJSON SeaElevation
164 instance FromJSON TimeUnit
165 instance ToJSON TimeUnit
166 instance FromJSON Clique
167 instance ToJSON Clique
168 instance FromJSON PhyloLabel
169 instance ToJSON PhyloLabel
170 instance FromJSON Tagger
171 instance ToJSON Tagger
172 instance FromJSON Sort
173 instance ToJSON Sort
174 instance FromJSON Order
175 instance ToJSON Order
176 instance FromJSON Filter
177 instance ToJSON Filter
178 instance FromJSON SynchronyScope
179 instance ToJSON SynchronyScope
180 instance FromJSON SynchronyStrategy
181 instance ToJSON SynchronyStrategy
182 instance FromJSON Synchrony
183 instance ToJSON Synchrony
184 instance FromJSON Quality
185 instance ToJSON Quality
186
187
188 -- | Software parameters
189 data Software =
190 Software { _software_name :: Text
191 , _software_version :: Text
192 } deriving (Generic, Show, Eq)
193
194 defaultSoftware :: Software
195 defaultSoftware =
196 Software { _software_name = pack "Gargantext"
197 , _software_version = pack "v4" }
198
199
200 -- | Global parameters of a Phylo
201 data PhyloParam =
202 PhyloParam { _phyloParam_version :: Text
203 , _phyloParam_software :: Software
204 , _phyloParam_config :: Config
205 } deriving (Generic, Show, Eq)
206
207 defaultPhyloParam :: PhyloParam
208 defaultPhyloParam =
209 PhyloParam { _phyloParam_version = pack "v2.adaptative"
210 , _phyloParam_software = defaultSoftware
211 , _phyloParam_config = defaultConfig }
212
213
214 ------------------
215 -- | Document | --
216 ------------------
217
218
219 -- | Date : a simple Integer
220 type Date = Int
221
222 -- | Ngrams : a contiguous sequence of n terms
223 type Ngrams = Text
224
225 -- | Document : a piece of Text linked to a Date
226 data Document = Document
227 { date :: Date
228 , text :: [Ngrams]
229 } deriving (Eq,Show,Generic,NFData)
230
231
232 --------------------
233 -- | Foundation | --
234 --------------------
235
236
237 -- | The Foundations of a Phylo created from a given TermList
238 data PhyloFoundations = PhyloFoundations
239 { _foundations_roots :: !(Vector Ngrams)
240 , _foundations_mapList :: TermList
241 } deriving (Generic, Show, Eq)
242
243
244 ---------------------------
245 -- | Coocurency Matrix | --
246 ---------------------------
247
248
249 -- | Cooc : a coocurency matrix between two ngrams
250 type Cooc = Map (Int,Int) Double
251
252
253 -------------------
254 -- | Phylomemy | --
255 -------------------
256
257
258 -- | Phylo datatype of a phylomemy
259 -- foundations : the foundations of the phylo
260 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
261 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
262 -- param : the parameters of the phylomemy (with the user's configuration)
263 -- periods : the temporal steps of a phylomemy
264 data Phylo =
265 Phylo { _phylo_foundations :: PhyloFoundations
266 , _phylo_timeCooc :: !(Map Date Cooc)
267 , _phylo_timeDocs :: !(Map Date Double)
268 , _phylo_termFreq :: !(Map Int Double)
269 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
270 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
271 , _phylo_param :: PhyloParam
272 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
273 }
274 deriving (Generic, Show, Eq)
275
276
277 -- | PhyloPeriodId : the id of a given period
278 type PhyloPeriodId = (Date,Date)
279
280 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
281 -- id: tuple (start date, end date) of the temporal step of the phylomemy
282 -- levels: levels of granularity
283 data PhyloPeriod =
284 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
285 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
286 } deriving (Generic, Show, Eq)
287
288
289 -- | Level : a level of clustering
290 type Level = Int
291
292 -- | PhyloLevelId : the id of a level of clustering in a given period
293 type PhyloLevelId = (PhyloPeriodId,Level)
294
295 -- | PhyloLevel : levels of phylomemy on a synchronic axis
296 -- Levels description:
297 -- Level 0: The foundations and the base of the phylo
298 -- Level 1: First level of clustering (the Fis)
299 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
300 data PhyloLevel =
301 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
302 , _phylo_levelLevel :: Level
303 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
304 }
305 deriving (Generic, Show, Eq)
306
307
308 type PhyloGroupId = (PhyloLevelId, Int)
309
310 -- | BranchId : (a level, a sequence of branch index)
311 -- the sequence is a path of heritage from the most to the less specific branch
312 type PhyloBranchId = (Level, [Int])
313
314 -- | PhyloGroup : group of ngrams at each level and period
315 data PhyloGroup =
316 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
317 , _phylo_groupLevel :: Level
318 , _phylo_groupIndex :: Int
319 , _phylo_groupLabel :: Text
320 , _phylo_groupSupport :: Support
321 , _phylo_groupNgrams :: [Int]
322 , _phylo_groupCooc :: !(Cooc)
323 , _phylo_groupBranchId :: PhyloBranchId
324 , _phylo_groupMeta :: Map Text [Double]
325 , _phylo_groupLevelParents :: [Pointer]
326 , _phylo_groupLevelChilds :: [Pointer]
327 , _phylo_groupPeriodParents :: [Pointer]
328 , _phylo_groupPeriodChilds :: [Pointer]
329 }
330 deriving (Generic, Show, Eq, NFData)
331
332 -- | Weight : A generic mesure that can be associated with an Id
333 type Weight = Double
334
335 -- | Pointer : A weighted pointer to a given PhyloGroup
336 type Pointer = (PhyloGroupId, Weight)
337
338 data Filiation = ToParents | ToChilds deriving (Generic, Show)
339 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
340
341
342 ----------------------
343 -- | Phylo Clique | --
344 ----------------------
345
346 -- | Support : Number of Documents where a Clique occurs
347 type Support = Int
348
349 data PhyloClique = PhyloClique
350 { _phyloClique_nodes :: [Int]
351 , _phyloClique_support :: Support
352 , _phyloClique_period :: (Date,Date)
353 } deriving (Generic,NFData,Show,Eq)
354
355
356 ------------------------
357 -- | Phylo Ancestor | --
358 ------------------------
359
360 data PhyloAncestor = PhyloAncestor
361 { _phyloAncestor_id :: Int
362 , _phyloAncestor_ngrams :: [Int]
363 , _phyloAncestor_groups :: [PhyloGroupId]
364 } deriving (Generic,NFData,Show,Eq)
365
366 ----------------
367 -- | Export | --
368 ----------------
369
370 type DotId = TextLazy.Text
371
372 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
373
374 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
375
376 data Order = Asc | Desc deriving (Show,Generic,Eq)
377
378 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
379
380 data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
381
382 data PhyloLabel =
383 BranchLabel
384 { _branch_labelTagger :: Tagger
385 , _branch_labelSize :: Int }
386 | GroupLabel
387 { _group_labelTagger :: Tagger
388 , _group_labelSize :: Int }
389 deriving (Show,Generic,Eq)
390
391 data PhyloBranch =
392 PhyloBranch
393 { _branch_id :: PhyloBranchId
394 , _branch_canonId :: [Int]
395 , _branch_seaLevel :: [Double]
396 , _branch_x :: Double
397 , _branch_y :: Double
398 , _branch_w :: Double
399 , _branch_t :: Double
400 , _branch_label :: Text
401 , _branch_meta :: Map Text [Double]
402 } deriving (Generic, Show, Eq)
403
404 data PhyloExport =
405 PhyloExport
406 { _export_groups :: [PhyloGroup]
407 , _export_branches :: [PhyloBranch]
408 , _export_ancestors :: [PhyloAncestor]
409 } deriving (Generic, Show)
410
411 ----------------
412 -- | Lenses | --
413 ----------------
414
415 makeLenses ''Config
416 makeLenses ''Proximity
417 makeLenses ''SeaElevation
418 makeLenses ''Quality
419 makeLenses ''Clique
420 makeLenses ''PhyloLabel
421 makeLenses ''TimeUnit
422 makeLenses ''PhyloFoundations
423 makeLenses ''PhyloClique
424 makeLenses ''Phylo
425 makeLenses ''PhyloPeriod
426 makeLenses ''PhyloLevel
427 makeLenses ''PhyloGroup
428 makeLenses ''PhyloParam
429 makeLenses ''PhyloExport
430 makeLenses ''PhyloBranch
431
432 ------------------------
433 -- | JSON instances | --
434 ------------------------
435
436
437 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)