]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/AdaptativePhylo.hs
working on ancestors
[gargantext.git] / src / Gargantext / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE TemplateHaskell #-}
27 {-# LANGUAGE MultiParamTypeClasses #-}
28
29 module Gargantext.Viz.AdaptativePhylo where
30
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Text (Text, pack)
34 import Data.Vector (Vector)
35 import Data.Map (Map)
36 import Data.Set (Set)
37
38 import Gargantext.Core.Utils.Prefix (unPrefix)
39 import Gargantext.Prelude
40 import Gargantext.Text.Context (TermList)
41
42 import GHC.Generics
43 import GHC.IO (FilePath)
44 import Control.DeepSeq (NFData)
45 import Control.Lens (makeLenses)
46
47 import qualified Data.Text.Lazy as TextLazy
48
49
50 ----------------
51 -- | Config | --
52 ----------------
53
54
55 data CorpusParser =
56 Wos {_wos_limit :: Int}
57 | Csv {_csv_limit :: Int}
58 deriving (Show,Generic,Eq)
59
60 data SeaElevation =
61 Constante
62 { _cons_start :: Double
63 , _cons_step :: Double }
64 | Adaptative
65 { _adap_granularity :: Double }
66 deriving (Show,Generic,Eq)
67
68 data Proximity =
69 WeightedLogJaccard
70 { _wlj_sensibility :: Double
71 -- , _wlj_thresholdInit :: Double
72 -- , _wlj_thresholdStep :: Double
73 -- | max height for sea level in temporal matching
74 -- , _wlj_elevation :: Double
75 }
76 | Hamming
77 deriving (Show,Generic,Eq)
78
79
80 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
81
82 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
83
84 data Synchrony =
85 ByProximityThreshold
86 { _bpt_threshold :: Double
87 , _bpt_sensibility :: Double
88 , _bpt_scope :: SynchronyScope
89 , _bpt_strategy :: SynchronyStrategy }
90 | ByProximityDistribution
91 { _bpd_sensibility :: Double
92 , _bpd_strategy :: SynchronyStrategy }
93 deriving (Show,Generic,Eq)
94
95
96 data TimeUnit =
97 Year
98 { _year_period :: Int
99 , _year_step :: Int
100 , _year_matchingFrame :: Int }
101 deriving (Show,Generic,Eq)
102
103
104 data Clique =
105 Fis
106 { _fis_support :: Int
107 , _fis_size :: Int }
108 | MaxClique
109 { _mcl_size :: Int }
110 deriving (Show,Generic,Eq)
111
112
113 data Quality =
114 Quality { _qua_granularity :: Double
115 , _qua_minBranch :: Int }
116 deriving (Show,Generic,Eq)
117
118
119 data Config =
120 Config { corpusPath :: FilePath
121 , listPath :: FilePath
122 , outputPath :: FilePath
123 , corpusParser :: CorpusParser
124 , phyloName :: Text
125 , phyloLevel :: Int
126 , phyloProximity :: Proximity
127 , seaElevation :: SeaElevation
128 , phyloSynchrony :: Synchrony
129 , phyloQuality :: Quality
130 , timeUnit :: TimeUnit
131 , clique :: Clique
132 , exportLabel :: [PhyloLabel]
133 , exportSort :: Sort
134 , exportFilter :: [Filter]
135 } deriving (Show,Generic,Eq)
136
137
138 defaultConfig :: Config
139 defaultConfig =
140 Config { corpusPath = ""
141 , listPath = ""
142 , outputPath = ""
143 , corpusParser = Csv 1000
144 , phyloName = pack "Default Phylo"
145 , phyloLevel = 2
146 , phyloProximity = WeightedLogJaccard 10
147 , seaElevation = Constante 0 0.1
148 , phyloSynchrony = ByProximityThreshold 0.5 10 SiblingBranches MergeAllGroups
149 , phyloQuality = Quality 1 1
150 , timeUnit = Year 3 1 5
151 , clique = Fis 1 5
152 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
153 , exportSort = ByHierarchy
154 , exportFilter = [ByBranchSize 2]
155 }
156
157 instance FromJSON Config
158 instance ToJSON Config
159 instance FromJSON CorpusParser
160 instance ToJSON CorpusParser
161 instance FromJSON Proximity
162 instance ToJSON Proximity
163 instance FromJSON SeaElevation
164 instance ToJSON SeaElevation
165 instance FromJSON TimeUnit
166 instance ToJSON TimeUnit
167 instance FromJSON Clique
168 instance ToJSON Clique
169 instance FromJSON PhyloLabel
170 instance ToJSON PhyloLabel
171 instance FromJSON Tagger
172 instance ToJSON Tagger
173 instance FromJSON Sort
174 instance ToJSON Sort
175 instance FromJSON Order
176 instance ToJSON Order
177 instance FromJSON Filter
178 instance ToJSON Filter
179 instance FromJSON SynchronyScope
180 instance ToJSON SynchronyScope
181 instance FromJSON SynchronyStrategy
182 instance ToJSON SynchronyStrategy
183 instance FromJSON Synchrony
184 instance ToJSON Synchrony
185 instance FromJSON Quality
186 instance ToJSON Quality
187
188
189 -- | Software parameters
190 data Software =
191 Software { _software_name :: Text
192 , _software_version :: Text
193 } deriving (Generic, Show, Eq)
194
195 defaultSoftware :: Software
196 defaultSoftware =
197 Software { _software_name = pack "Gargantext"
198 , _software_version = pack "v4" }
199
200
201 -- | Global parameters of a Phylo
202 data PhyloParam =
203 PhyloParam { _phyloParam_version :: Text
204 , _phyloParam_software :: Software
205 , _phyloParam_config :: Config
206 } deriving (Generic, Show, Eq)
207
208 defaultPhyloParam :: PhyloParam
209 defaultPhyloParam =
210 PhyloParam { _phyloParam_version = pack "v2.adaptative"
211 , _phyloParam_software = defaultSoftware
212 , _phyloParam_config = defaultConfig }
213
214
215 ------------------
216 -- | Document | --
217 ------------------
218
219
220 -- | Date : a simple Integer
221 type Date = Int
222
223 -- | Ngrams : a contiguous sequence of n terms
224 type Ngrams = Text
225
226 -- | Document : a piece of Text linked to a Date
227 data Document = Document
228 { date :: Date
229 , text :: [Ngrams]
230 } deriving (Eq,Show,Generic,NFData)
231
232
233 --------------------
234 -- | Foundation | --
235 --------------------
236
237
238 -- | The Foundations of a Phylo created from a given TermList
239 data PhyloFoundations = PhyloFoundations
240 { _foundations_roots :: !(Vector Ngrams)
241 , _foundations_mapList :: TermList
242 } deriving (Generic, Show, Eq)
243
244
245 ---------------------------
246 -- | Coocurency Matrix | --
247 ---------------------------
248
249
250 -- | Cooc : a coocurency matrix between two ngrams
251 type Cooc = Map (Int,Int) Double
252
253
254 -------------------
255 -- | Phylomemy | --
256 -------------------
257
258
259 -- | Phylo datatype of a phylomemy
260 -- foundations : the foundations of the phylo
261 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
262 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
263 -- param : the parameters of the phylomemy (with the user's configuration)
264 -- periods : the temporal steps of a phylomemy
265 data Phylo =
266 Phylo { _phylo_foundations :: PhyloFoundations
267 , _phylo_timeCooc :: !(Map Date Cooc)
268 , _phylo_timeDocs :: !(Map Date Double)
269 , _phylo_termFreq :: !(Map Int Double)
270 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
271 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
272 , _phylo_param :: PhyloParam
273 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
274 }
275 deriving (Generic, Show, Eq)
276
277
278 -- | PhyloPeriodId : the id of a given period
279 type PhyloPeriodId = (Date,Date)
280
281 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
282 -- id: tuple (start date, end date) of the temporal step of the phylomemy
283 -- levels: levels of granularity
284 data PhyloPeriod =
285 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
286 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
287 } deriving (Generic, Show, Eq)
288
289
290 -- | Level : a level of clustering
291 type Level = Int
292
293 -- | PhyloLevelId : the id of a level of clustering in a given period
294 type PhyloLevelId = (PhyloPeriodId,Level)
295
296 -- | PhyloLevel : levels of phylomemy on a synchronic axis
297 -- Levels description:
298 -- Level 0: The foundations and the base of the phylo
299 -- Level 1: First level of clustering (the Fis)
300 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
301 data PhyloLevel =
302 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
303 , _phylo_levelLevel :: Level
304 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
305 }
306 deriving (Generic, Show, Eq)
307
308
309 type PhyloGroupId = (PhyloLevelId, Int)
310
311 -- | BranchId : (a level, a sequence of branch index)
312 -- the sequence is a path of heritage from the most to the less specific branch
313 type PhyloBranchId = (Level, [Int])
314
315 -- | PhyloGroup : group of ngrams at each level and period
316 data PhyloGroup =
317 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
318 , _phylo_groupLevel :: Level
319 , _phylo_groupIndex :: Int
320 , _phylo_groupLabel :: Text
321 , _phylo_groupSupport :: Support
322 , _phylo_groupNgrams :: [Int]
323 , _phylo_groupCooc :: !(Cooc)
324 , _phylo_groupBranchId :: PhyloBranchId
325 , _phylo_groupMeta :: Map Text [Double]
326 , _phylo_groupLevelParents :: [Pointer]
327 , _phylo_groupLevelChilds :: [Pointer]
328 , _phylo_groupPeriodParents :: [Pointer]
329 , _phylo_groupPeriodChilds :: [Pointer]
330 }
331 deriving (Generic, Show, Eq, NFData)
332
333 -- | Weight : A generic mesure that can be associated with an Id
334 type Weight = Double
335
336 -- | Pointer : A weighted pointer to a given PhyloGroup
337 type Pointer = (PhyloGroupId, Weight)
338
339 data Filiation = ToParents | ToChilds deriving (Generic, Show)
340 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
341
342
343 ----------------------
344 -- | Phylo Clique | --
345 ----------------------
346
347 -- | Support : Number of Documents where a Clique occurs
348 type Support = Int
349
350 data PhyloClique = PhyloClique
351 { _phyloClique_nodes :: Set Ngrams
352 , _phyloClique_support :: Support
353 , _phyloClique_period :: (Date,Date)
354 } deriving (Generic,NFData,Show,Eq)
355
356
357 ----------------
358 -- | Export | --
359 ----------------
360
361 type DotId = TextLazy.Text
362
363 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
364
365 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
366
367 data Order = Asc | Desc deriving (Show,Generic,Eq)
368
369 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
370
371 data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
372
373 data PhyloLabel =
374 BranchLabel
375 { _branch_labelTagger :: Tagger
376 , _branch_labelSize :: Int }
377 | GroupLabel
378 { _group_labelTagger :: Tagger
379 , _group_labelSize :: Int }
380 deriving (Show,Generic,Eq)
381
382 data PhyloBranch =
383 PhyloBranch
384 { _branch_id :: PhyloBranchId
385 , _branch_canonId :: [Int]
386 , _branch_seaLevel :: [Double]
387 , _branch_x :: Double
388 , _branch_y :: Double
389 , _branch_w :: Double
390 , _branch_t :: Double
391 , _branch_label :: Text
392 , _branch_meta :: Map Text [Double]
393 } deriving (Generic, Show, Eq)
394
395 data PhyloExport =
396 PhyloExport
397 { _export_groups :: [PhyloGroup]
398 , _export_branches :: [PhyloBranch]
399 } deriving (Generic, Show)
400
401 ----------------
402 -- | Lenses | --
403 ----------------
404
405 makeLenses ''Config
406 makeLenses ''Proximity
407 makeLenses ''SeaElevation
408 makeLenses ''Quality
409 makeLenses ''Clique
410 makeLenses ''PhyloLabel
411 makeLenses ''TimeUnit
412 makeLenses ''PhyloFoundations
413 makeLenses ''PhyloClique
414 makeLenses ''Phylo
415 makeLenses ''PhyloPeriod
416 makeLenses ''PhyloLevel
417 makeLenses ''PhyloGroup
418 makeLenses ''PhyloParam
419 makeLenses ''PhyloExport
420 makeLenses ''PhyloBranch
421
422 ------------------------
423 -- | JSON instances | --
424 ------------------------
425
426
427 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)