]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/AdaptativePhylo.hs
add weighted csv
[gargantext.git] / src / Gargantext / Core / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.AdaptativePhylo where
28
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Text (Text, pack)
32 import Data.Vector (Vector)
33 import Data.Map (Map)
34
35 import Gargantext.Core.Utils.Prefix (unPrefix)
36 import Gargantext.Prelude
37 import Gargantext.Core.Text.Context (TermList)
38
39 import GHC.Generics
40 import GHC.IO (FilePath)
41 import Control.DeepSeq (NFData)
42 import Control.Lens (makeLenses)
43
44 import qualified Data.Text.Lazy as TextLazy
45
46
47 ----------------
48 -- | Config | --
49 ----------------
50
51
52 data CorpusParser =
53 Wos {_wos_limit :: Int}
54 | Csv {_csv_limit :: Int}
55 | CsvWeighted {_csvw_limit :: Int}
56 deriving (Show,Generic,Eq)
57
58 data SeaElevation =
59 Constante
60 { _cons_start :: Double
61 , _cons_step :: Double }
62 | Adaptative
63 { _adap_granularity :: Double }
64 deriving (Show,Generic,Eq)
65
66 data Proximity =
67 WeightedLogJaccard
68 { _wlj_sensibility :: Double
69 {-
70 -- , _wlj_thresholdInit :: Double
71 -- , _wlj_thresholdStep :: Double
72 -- | max height for sea level in temporal matching
73 -- , _wlj_elevation :: Double
74 -}
75 }
76 | WeightedLogSim
77 { _wlj_sensibility :: Double
78 {-
79 -- , _wlj_thresholdInit :: Double
80 -- , _wlj_thresholdStep :: Double
81 -- | max height for sea level in temporal matching
82 -- , _wlj_elevation :: Double
83 -}
84 }
85 | Hamming
86 deriving (Show,Generic,Eq)
87
88
89 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
90
91 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
92
93 data Synchrony =
94 ByProximityThreshold
95 { _bpt_threshold :: Double
96 , _bpt_sensibility :: Double
97 , _bpt_scope :: SynchronyScope
98 , _bpt_strategy :: SynchronyStrategy }
99 | ByProximityDistribution
100 { _bpd_sensibility :: Double
101 , _bpd_strategy :: SynchronyStrategy }
102 deriving (Show,Generic,Eq)
103
104
105 data TimeUnit =
106 Year
107 { _year_period :: Int
108 , _year_step :: Int
109 , _year_matchingFrame :: Int }
110 deriving (Show,Generic,Eq)
111
112 data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
113
114 data Clique =
115 Fis
116 { _fis_support :: Int
117 , _fis_size :: Int }
118 | MaxClique
119 { _mcl_size :: Int
120 , _mcl_threshold :: Double
121 , _mcl_filter :: CliqueFilter }
122 deriving (Show,Generic,Eq)
123
124
125 data Quality =
126 Quality { _qua_granularity :: Double
127 , _qua_minBranch :: Int }
128 deriving (Show,Generic,Eq)
129
130
131 data Config =
132 Config { corpusPath :: FilePath
133 , listPath :: FilePath
134 , outputPath :: FilePath
135 , corpusParser :: CorpusParser
136 , phyloName :: Text
137 , phyloLevel :: Int
138 , phyloProximity :: Proximity
139 , seaElevation :: SeaElevation
140 , findAncestors :: Bool
141 , phyloSynchrony :: Synchrony
142 , phyloQuality :: Quality
143 , timeUnit :: TimeUnit
144 , clique :: Clique
145 , exportLabel :: [PhyloLabel]
146 , exportSort :: Sort
147 , exportFilter :: [Filter]
148 } deriving (Show,Generic,Eq)
149
150
151 defaultConfig :: Config
152 defaultConfig =
153 Config { corpusPath = ""
154 , listPath = ""
155 , outputPath = ""
156 , corpusParser = Csv 1000
157 , phyloName = pack "Default Phylo"
158 , phyloLevel = 2
159 , phyloProximity = WeightedLogJaccard 10
160 , seaElevation = Constante 0.1 0.1
161 , findAncestors = True
162 , phyloSynchrony = ByProximityThreshold 0.1 10 SiblingBranches MergeAllGroups
163 , phyloQuality = Quality 0 1
164 , timeUnit = Year 3 1 5
165 , clique = MaxClique 0 3 ByNeighbours
166 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
167 , exportSort = ByHierarchy
168 , exportFilter = [ByBranchSize 2]
169 }
170
171 instance FromJSON Config
172 instance ToJSON Config
173 instance FromJSON CorpusParser
174 instance ToJSON CorpusParser
175 instance FromJSON Proximity
176 instance ToJSON Proximity
177 instance FromJSON SeaElevation
178 instance ToJSON SeaElevation
179 instance FromJSON TimeUnit
180 instance ToJSON TimeUnit
181 instance FromJSON CliqueFilter
182 instance ToJSON CliqueFilter
183 instance FromJSON Clique
184 instance ToJSON Clique
185 instance FromJSON PhyloLabel
186 instance ToJSON PhyloLabel
187 instance FromJSON Tagger
188 instance ToJSON Tagger
189 instance FromJSON Sort
190 instance ToJSON Sort
191 instance FromJSON Order
192 instance ToJSON Order
193 instance FromJSON Filter
194 instance ToJSON Filter
195 instance FromJSON SynchronyScope
196 instance ToJSON SynchronyScope
197 instance FromJSON SynchronyStrategy
198 instance ToJSON SynchronyStrategy
199 instance FromJSON Synchrony
200 instance ToJSON Synchrony
201 instance FromJSON Quality
202 instance ToJSON Quality
203
204
205 -- | Software parameters
206 data Software =
207 Software { _software_name :: Text
208 , _software_version :: Text
209 } deriving (Generic, Show, Eq)
210
211 defaultSoftware :: Software
212 defaultSoftware =
213 Software { _software_name = pack "Gargantext"
214 , _software_version = pack "v4" }
215
216
217 -- | Global parameters of a Phylo
218 data PhyloParam =
219 PhyloParam { _phyloParam_version :: Text
220 , _phyloParam_software :: Software
221 , _phyloParam_config :: Config
222 } deriving (Generic, Show, Eq)
223
224 defaultPhyloParam :: PhyloParam
225 defaultPhyloParam =
226 PhyloParam { _phyloParam_version = pack "v2.adaptative"
227 , _phyloParam_software = defaultSoftware
228 , _phyloParam_config = defaultConfig }
229
230
231 ------------------
232 -- | Document | --
233 ------------------
234
235 -- | Date : a simple Integer
236 type Date = Int
237
238 -- | Ngrams : a contiguous sequence of n terms
239 type Ngrams = Text
240
241 -- | Document : a piece of Text linked to a Date
242 data Document = Document
243 { date :: Date
244 , text :: [Ngrams]
245 , weight :: Maybe Double
246 } deriving (Eq,Show,Generic,NFData)
247
248
249 --------------------
250 -- | Foundation | --
251 --------------------
252
253
254 -- | The Foundations of a Phylo created from a given TermList
255 data PhyloFoundations = PhyloFoundations
256 { _foundations_roots :: !(Vector Ngrams)
257 , _foundations_mapList :: TermList
258 } deriving (Generic, Show, Eq)
259
260
261 ---------------------------
262 -- | Coocurency Matrix | --
263 ---------------------------
264
265
266 -- | Cooc : a coocurency matrix between two ngrams
267 type Cooc = Map (Int,Int) Double
268
269
270 -------------------
271 -- | Phylomemy | --
272 -------------------
273
274
275 -- | Phylo datatype of a phylomemy
276 -- foundations : the foundations of the phylo
277 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
278 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
279 -- param : the parameters of the phylomemy (with the user's configuration)
280 -- periods : the temporal steps of a phylomemy
281 data Phylo =
282 Phylo { _phylo_foundations :: PhyloFoundations
283 , _phylo_timeCooc :: !(Map Date Cooc)
284 , _phylo_timeDocs :: !(Map Date Double)
285 , _phylo_termFreq :: !(Map Int Double)
286 , _phylo_lastTermFreq :: !(Map Int Double)
287 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
288 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
289 , _phylo_param :: PhyloParam
290 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
291 }
292 deriving (Generic, Show, Eq)
293
294
295 -- | PhyloPeriodId : the id of a given period
296 type PhyloPeriodId = (Date,Date)
297
298 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
299 -- id: tuple (start date, end date) of the temporal step of the phylomemy
300 -- levels: levels of granularity
301 data PhyloPeriod =
302 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
303 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
304 } deriving (Generic, Show, Eq)
305
306
307 -- | Level : a level of clustering
308 type Level = Int
309
310 -- | PhyloLevelId : the id of a level of clustering in a given period
311 type PhyloLevelId = (PhyloPeriodId,Level)
312
313 -- | PhyloLevel : levels of phylomemy on a synchronic axis
314 -- Levels description:
315 -- Level 0: The foundations and the base of the phylo
316 -- Level 1: First level of clustering (the Fis)
317 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
318 data PhyloLevel =
319 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
320 , _phylo_levelLevel :: Level
321 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
322 }
323 deriving (Generic, Show, Eq)
324
325
326 type PhyloGroupId = (PhyloLevelId, Int)
327
328 -- | BranchId : (a level, a sequence of branch index)
329 -- the sequence is a path of heritage from the most to the less specific branch
330 type PhyloBranchId = (Level, [Int])
331
332 -- | PhyloGroup : group of ngrams at each level and period
333 data PhyloGroup =
334 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
335 , _phylo_groupLevel :: Level
336 , _phylo_groupIndex :: Int
337 , _phylo_groupLabel :: Text
338 , _phylo_groupSupport :: Support
339 , _phylo_groupWeight :: Maybe Double
340 , _phylo_groupNgrams :: [Int]
341 , _phylo_groupCooc :: !(Cooc)
342 , _phylo_groupBranchId :: PhyloBranchId
343 , _phylo_groupMeta :: Map Text [Double]
344 , _phylo_groupLevelParents :: [Pointer]
345 , _phylo_groupLevelChilds :: [Pointer]
346 , _phylo_groupPeriodParents :: [Pointer]
347 , _phylo_groupPeriodChilds :: [Pointer]
348 , _phylo_groupAncestors :: [Pointer]
349 }
350 deriving (Generic, Show, Eq, NFData)
351
352 -- | Weight : A generic mesure that can be associated with an Id
353 type Weight = Double
354
355 -- | Pointer : A weighted pointer to a given PhyloGroup
356 type Pointer = (PhyloGroupId, Weight)
357
358 data Filiation = ToParents | ToChilds deriving (Generic, Show)
359 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
360
361
362 ----------------------
363 -- | Phylo Clique | --
364 ----------------------
365
366 -- | Support : Number of Documents where a Clique occurs
367 type Support = Int
368
369 data PhyloClique = PhyloClique
370 { _phyloClique_nodes :: [Int]
371 , _phyloClique_support :: Support
372 , _phyloClique_period :: (Date,Date)
373 , _phyloClique_weight :: Maybe Double
374 } deriving (Generic,NFData,Show,Eq)
375
376 ----------------
377 -- | Export | --
378 ----------------
379
380 type DotId = TextLazy.Text
381
382 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
383
384 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
385
386 data Order = Asc | Desc deriving (Show,Generic,Eq)
387
388 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
389
390 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
391
392 data PhyloLabel =
393 BranchLabel
394 { _branch_labelTagger :: Tagger
395 , _branch_labelSize :: Int }
396 | GroupLabel
397 { _group_labelTagger :: Tagger
398 , _group_labelSize :: Int }
399 deriving (Show,Generic,Eq)
400
401 data PhyloBranch =
402 PhyloBranch
403 { _branch_id :: PhyloBranchId
404 , _branch_canonId :: [Int]
405 , _branch_seaLevel :: [Double]
406 , _branch_x :: Double
407 , _branch_y :: Double
408 , _branch_w :: Double
409 , _branch_t :: Double
410 , _branch_label :: Text
411 , _branch_meta :: Map Text [Double]
412 } deriving (Generic, Show, Eq)
413
414 data PhyloExport =
415 PhyloExport
416 { _export_groups :: [PhyloGroup]
417 , _export_branches :: [PhyloBranch]
418 } deriving (Generic, Show)
419
420 ----------------
421 -- | Lenses | --
422 ----------------
423
424 makeLenses ''Config
425 makeLenses ''Proximity
426 makeLenses ''SeaElevation
427 makeLenses ''Quality
428 makeLenses ''Clique
429 makeLenses ''PhyloLabel
430 makeLenses ''TimeUnit
431 makeLenses ''PhyloFoundations
432 makeLenses ''PhyloClique
433 makeLenses ''Phylo
434 makeLenses ''PhyloPeriod
435 makeLenses ''PhyloLevel
436 makeLenses ''PhyloGroup
437 makeLenses ''PhyloParam
438 makeLenses ''PhyloExport
439 makeLenses ''PhyloBranch
440
441 ------------------------
442 -- | JSON instances | --
443 ------------------------
444
445 instance FromJSON Phylo
446 instance ToJSON Phylo
447 instance FromJSON PhyloParam
448 instance ToJSON PhyloParam
449 instance FromJSON PhyloPeriod
450 instance ToJSON PhyloPeriod
451 instance FromJSON PhyloLevel
452 instance ToJSON PhyloLevel
453 instance FromJSON Software
454 instance ToJSON Software
455 instance FromJSON PhyloGroup
456 instance ToJSON PhyloGroup
457
458 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)