]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/AdaptativePhylo.hs
working on a new distance in phylo
[gargantext.git] / src / Gargantext / Core / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.AdaptativePhylo where
28
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Text (Text, pack)
32 import Data.Vector (Vector)
33 import Data.Map (Map)
34
35 import Gargantext.Core.Utils.Prefix (unPrefix)
36 import Gargantext.Prelude
37 import Gargantext.Core.Text.Context (TermList)
38
39 import GHC.Generics
40 import GHC.IO (FilePath)
41 import Control.DeepSeq (NFData)
42 import Control.Lens (makeLenses)
43
44 import qualified Data.Text.Lazy as TextLazy
45
46
47 ----------------
48 -- | Config | --
49 ----------------
50
51
52 data CorpusParser =
53 Wos {_wos_limit :: Int}
54 | Csv {_csv_limit :: Int}
55 deriving (Show,Generic,Eq)
56
57 data SeaElevation =
58 Constante
59 { _cons_start :: Double
60 , _cons_step :: Double }
61 | Adaptative
62 { _adap_granularity :: Double }
63 deriving (Show,Generic,Eq)
64
65 data Proximity =
66 WeightedLogJaccard
67 { _wlj_sensibility :: Double
68 {-
69 -- , _wlj_thresholdInit :: Double
70 -- , _wlj_thresholdStep :: Double
71 -- | max height for sea level in temporal matching
72 -- , _wlj_elevation :: Double
73 -}
74 }
75 | WeightedLogSim
76 { _wlj_sensibility :: Double
77 {-
78 -- , _wlj_thresholdInit :: Double
79 -- , _wlj_thresholdStep :: Double
80 -- | max height for sea level in temporal matching
81 -- , _wlj_elevation :: Double
82 -}
83 }
84 | Hamming
85 deriving (Show,Generic,Eq)
86
87
88 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
89
90 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
91
92 data Synchrony =
93 ByProximityThreshold
94 { _bpt_threshold :: Double
95 , _bpt_sensibility :: Double
96 , _bpt_scope :: SynchronyScope
97 , _bpt_strategy :: SynchronyStrategy }
98 | ByProximityDistribution
99 { _bpd_sensibility :: Double
100 , _bpd_strategy :: SynchronyStrategy }
101 deriving (Show,Generic,Eq)
102
103
104 data TimeUnit =
105 Year
106 { _year_period :: Int
107 , _year_step :: Int
108 , _year_matchingFrame :: Int }
109 deriving (Show,Generic,Eq)
110
111 data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
112
113 data Clique =
114 Fis
115 { _fis_support :: Int
116 , _fis_size :: Int }
117 | MaxClique
118 { _mcl_size :: Int
119 , _mcl_threshold :: Double
120 , _mcl_filter :: CliqueFilter }
121 deriving (Show,Generic,Eq)
122
123
124 data Quality =
125 Quality { _qua_granularity :: Double
126 , _qua_minBranch :: Int }
127 deriving (Show,Generic,Eq)
128
129
130 data Config =
131 Config { corpusPath :: FilePath
132 , listPath :: FilePath
133 , outputPath :: FilePath
134 , corpusParser :: CorpusParser
135 , phyloName :: Text
136 , phyloLevel :: Int
137 , phyloProximity :: Proximity
138 , seaElevation :: SeaElevation
139 , findAncestors :: Bool
140 , phyloSynchrony :: Synchrony
141 , phyloQuality :: Quality
142 , timeUnit :: TimeUnit
143 , clique :: Clique
144 , exportLabel :: [PhyloLabel]
145 , exportSort :: Sort
146 , exportFilter :: [Filter]
147 } deriving (Show,Generic,Eq)
148
149
150 defaultConfig :: Config
151 defaultConfig =
152 Config { corpusPath = ""
153 , listPath = ""
154 , outputPath = ""
155 , corpusParser = Csv 1000
156 , phyloName = pack "Default Phylo"
157 , phyloLevel = 2
158 , phyloProximity = WeightedLogJaccard 10
159 , seaElevation = Constante 0.1 0.1
160 , findAncestors = True
161 , phyloSynchrony = ByProximityThreshold 0.1 10 SiblingBranches MergeAllGroups
162 , phyloQuality = Quality 0 1
163 , timeUnit = Year 3 1 5
164 , clique = MaxClique 0 3 ByNeighbours
165 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
166 , exportSort = ByHierarchy
167 , exportFilter = [ByBranchSize 2]
168 }
169
170 instance FromJSON Config
171 instance ToJSON Config
172 instance FromJSON CorpusParser
173 instance ToJSON CorpusParser
174 instance FromJSON Proximity
175 instance ToJSON Proximity
176 instance FromJSON SeaElevation
177 instance ToJSON SeaElevation
178 instance FromJSON TimeUnit
179 instance ToJSON TimeUnit
180 instance FromJSON CliqueFilter
181 instance ToJSON CliqueFilter
182 instance FromJSON Clique
183 instance ToJSON Clique
184 instance FromJSON PhyloLabel
185 instance ToJSON PhyloLabel
186 instance FromJSON Tagger
187 instance ToJSON Tagger
188 instance FromJSON Sort
189 instance ToJSON Sort
190 instance FromJSON Order
191 instance ToJSON Order
192 instance FromJSON Filter
193 instance ToJSON Filter
194 instance FromJSON SynchronyScope
195 instance ToJSON SynchronyScope
196 instance FromJSON SynchronyStrategy
197 instance ToJSON SynchronyStrategy
198 instance FromJSON Synchrony
199 instance ToJSON Synchrony
200 instance FromJSON Quality
201 instance ToJSON Quality
202
203
204 -- | Software parameters
205 data Software =
206 Software { _software_name :: Text
207 , _software_version :: Text
208 } deriving (Generic, Show, Eq)
209
210 defaultSoftware :: Software
211 defaultSoftware =
212 Software { _software_name = pack "Gargantext"
213 , _software_version = pack "v4" }
214
215
216 -- | Global parameters of a Phylo
217 data PhyloParam =
218 PhyloParam { _phyloParam_version :: Text
219 , _phyloParam_software :: Software
220 , _phyloParam_config :: Config
221 } deriving (Generic, Show, Eq)
222
223 defaultPhyloParam :: PhyloParam
224 defaultPhyloParam =
225 PhyloParam { _phyloParam_version = pack "v2.adaptative"
226 , _phyloParam_software = defaultSoftware
227 , _phyloParam_config = defaultConfig }
228
229
230 ------------------
231 -- | Document | --
232 ------------------
233
234
235 -- | Date : a simple Integer
236 type Date = Int
237
238 -- | Ngrams : a contiguous sequence of n terms
239 type Ngrams = Text
240
241 -- | Document : a piece of Text linked to a Date
242 data Document = Document
243 { date :: Date
244 , text :: [Ngrams]
245 } deriving (Eq,Show,Generic,NFData)
246
247
248 --------------------
249 -- | Foundation | --
250 --------------------
251
252
253 -- | The Foundations of a Phylo created from a given TermList
254 data PhyloFoundations = PhyloFoundations
255 { _foundations_roots :: !(Vector Ngrams)
256 , _foundations_mapList :: TermList
257 } deriving (Generic, Show, Eq)
258
259
260 ---------------------------
261 -- | Coocurency Matrix | --
262 ---------------------------
263
264
265 -- | Cooc : a coocurency matrix between two ngrams
266 type Cooc = Map (Int,Int) Double
267
268
269 -------------------
270 -- | Phylomemy | --
271 -------------------
272
273
274 -- | Phylo datatype of a phylomemy
275 -- foundations : the foundations of the phylo
276 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
277 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
278 -- param : the parameters of the phylomemy (with the user's configuration)
279 -- periods : the temporal steps of a phylomemy
280 data Phylo =
281 Phylo { _phylo_foundations :: PhyloFoundations
282 , _phylo_timeCooc :: !(Map Date Cooc)
283 , _phylo_timeDocs :: !(Map Date Double)
284 , _phylo_termFreq :: !(Map Int Double)
285 , _phylo_lastTermFreq :: !(Map Int Double)
286 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
287 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
288 , _phylo_param :: PhyloParam
289 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
290 }
291 deriving (Generic, Show, Eq)
292
293
294 -- | PhyloPeriodId : the id of a given period
295 type PhyloPeriodId = (Date,Date)
296
297 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
298 -- id: tuple (start date, end date) of the temporal step of the phylomemy
299 -- levels: levels of granularity
300 data PhyloPeriod =
301 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
302 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
303 } deriving (Generic, Show, Eq)
304
305
306 -- | Level : a level of clustering
307 type Level = Int
308
309 -- | PhyloLevelId : the id of a level of clustering in a given period
310 type PhyloLevelId = (PhyloPeriodId,Level)
311
312 -- | PhyloLevel : levels of phylomemy on a synchronic axis
313 -- Levels description:
314 -- Level 0: The foundations and the base of the phylo
315 -- Level 1: First level of clustering (the Fis)
316 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
317 data PhyloLevel =
318 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
319 , _phylo_levelLevel :: Level
320 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
321 }
322 deriving (Generic, Show, Eq)
323
324
325 type PhyloGroupId = (PhyloLevelId, Int)
326
327 -- | BranchId : (a level, a sequence of branch index)
328 -- the sequence is a path of heritage from the most to the less specific branch
329 type PhyloBranchId = (Level, [Int])
330
331 -- | PhyloGroup : group of ngrams at each level and period
332 data PhyloGroup =
333 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
334 , _phylo_groupLevel :: Level
335 , _phylo_groupIndex :: Int
336 , _phylo_groupLabel :: Text
337 , _phylo_groupSupport :: Support
338 , _phylo_groupNgrams :: [Int]
339 , _phylo_groupCooc :: !(Cooc)
340 , _phylo_groupBranchId :: PhyloBranchId
341 , _phylo_groupMeta :: Map Text [Double]
342 , _phylo_groupLevelParents :: [Pointer]
343 , _phylo_groupLevelChilds :: [Pointer]
344 , _phylo_groupPeriodParents :: [Pointer]
345 , _phylo_groupPeriodChilds :: [Pointer]
346 , _phylo_groupAncestors :: [Pointer]
347 }
348 deriving (Generic, Show, Eq, NFData)
349
350 -- | Weight : A generic mesure that can be associated with an Id
351 type Weight = Double
352
353 -- | Pointer : A weighted pointer to a given PhyloGroup
354 type Pointer = (PhyloGroupId, Weight)
355
356 data Filiation = ToParents | ToChilds deriving (Generic, Show)
357 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
358
359
360 ----------------------
361 -- | Phylo Clique | --
362 ----------------------
363
364 -- | Support : Number of Documents where a Clique occurs
365 type Support = Int
366
367 data PhyloClique = PhyloClique
368 { _phyloClique_nodes :: [Int]
369 , _phyloClique_support :: Support
370 , _phyloClique_period :: (Date,Date)
371 } deriving (Generic,NFData,Show,Eq)
372
373 ----------------
374 -- | Export | --
375 ----------------
376
377 type DotId = TextLazy.Text
378
379 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
380
381 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
382
383 data Order = Asc | Desc deriving (Show,Generic,Eq)
384
385 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
386
387 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
388
389 data PhyloLabel =
390 BranchLabel
391 { _branch_labelTagger :: Tagger
392 , _branch_labelSize :: Int }
393 | GroupLabel
394 { _group_labelTagger :: Tagger
395 , _group_labelSize :: Int }
396 deriving (Show,Generic,Eq)
397
398 data PhyloBranch =
399 PhyloBranch
400 { _branch_id :: PhyloBranchId
401 , _branch_canonId :: [Int]
402 , _branch_seaLevel :: [Double]
403 , _branch_x :: Double
404 , _branch_y :: Double
405 , _branch_w :: Double
406 , _branch_t :: Double
407 , _branch_label :: Text
408 , _branch_meta :: Map Text [Double]
409 } deriving (Generic, Show, Eq)
410
411 data PhyloExport =
412 PhyloExport
413 { _export_groups :: [PhyloGroup]
414 , _export_branches :: [PhyloBranch]
415 } deriving (Generic, Show)
416
417 ----------------
418 -- | Lenses | --
419 ----------------
420
421 makeLenses ''Config
422 makeLenses ''Proximity
423 makeLenses ''SeaElevation
424 makeLenses ''Quality
425 makeLenses ''Clique
426 makeLenses ''PhyloLabel
427 makeLenses ''TimeUnit
428 makeLenses ''PhyloFoundations
429 makeLenses ''PhyloClique
430 makeLenses ''Phylo
431 makeLenses ''PhyloPeriod
432 makeLenses ''PhyloLevel
433 makeLenses ''PhyloGroup
434 makeLenses ''PhyloParam
435 makeLenses ''PhyloExport
436 makeLenses ''PhyloBranch
437
438 ------------------------
439 -- | JSON instances | --
440 ------------------------
441
442
443 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)