]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/AdaptativePhylo.hs
New similarity measure for inter-temporal matching added named WeightedLogSim Adapted...
[gargantext.git] / src / Gargantext / Core / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Core.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveAnyClass #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Core.Viz.AdaptativePhylo where
28
29 import Data.Aeson
30 import Data.Aeson.TH (deriveJSON)
31 import Data.Text (Text, pack)
32 import Data.Vector (Vector)
33 import Data.Map (Map)
34
35 import Gargantext.Core.Utils.Prefix (unPrefix)
36 import Gargantext.Prelude
37 import Gargantext.Core.Text.Context (TermList)
38
39 import GHC.Generics
40 import GHC.IO (FilePath)
41 import Control.DeepSeq (NFData)
42 import Control.Lens (makeLenses)
43
44 import qualified Data.Text.Lazy as TextLazy
45
46
47 ----------------
48 -- | Config | --
49 ----------------
50
51
52 data CorpusParser =
53 Wos {_wos_limit :: Int}
54 | Csv {_csv_limit :: Int}
55 deriving (Show,Generic,Eq)
56
57 data SeaElevation =
58 Constante
59 { _cons_start :: Double
60 , _cons_step :: Double }
61 | Adaptative
62 { _adap_granularity :: Double }
63 deriving (Show,Generic,Eq)
64
65 data Proximity =
66 WeightedLogJaccard
67 { _wlj_sensibility :: Double
68 {-
69 -- , _wlj_thresholdInit :: Double
70 -- , _wlj_thresholdStep :: Double
71 -- | max height for sea level in temporal matching
72 -- , _wlj_elevation :: Double
73 -}
74 }
75 | WeightedLogSim
76 { _wlj_sensibility :: Double
77 {-
78 -- , _wlj_thresholdInit :: Double
79 -- , _wlj_thresholdStep :: Double
80 -- | max height for sea level in temporal matching
81 -- , _wlj_elevation :: Double
82 -}
83 }
84 | Hamming
85 deriving (Show,Generic,Eq)
86
87
88 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
89
90 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
91
92 data Synchrony =
93 ByProximityThreshold
94 { _bpt_threshold :: Double
95 , _bpt_sensibility :: Double
96 , _bpt_scope :: SynchronyScope
97 , _bpt_strategy :: SynchronyStrategy }
98 | ByProximityDistribution
99 { _bpd_sensibility :: Double
100 , _bpd_strategy :: SynchronyStrategy }
101 deriving (Show,Generic,Eq)
102
103
104 data TimeUnit =
105 Year
106 { _year_period :: Int
107 , _year_step :: Int
108 , _year_matchingFrame :: Int }
109 deriving (Show,Generic,Eq)
110
111
112 data Clique =
113 Fis
114 { _fis_support :: Int
115 , _fis_size :: Int }
116 | MaxClique
117 { _mcl_size :: Int }
118 deriving (Show,Generic,Eq)
119
120
121 data Quality =
122 Quality { _qua_granularity :: Double
123 , _qua_minBranch :: Int }
124 deriving (Show,Generic,Eq)
125
126
127 data Config =
128 Config { corpusPath :: FilePath
129 , listPath :: FilePath
130 , outputPath :: FilePath
131 , corpusParser :: CorpusParser
132 , phyloName :: Text
133 , phyloLevel :: Int
134 , phyloProximity :: Proximity
135 , seaElevation :: SeaElevation
136 , findAncestors :: Bool
137 , phyloSynchrony :: Synchrony
138 , phyloQuality :: Quality
139 , timeUnit :: TimeUnit
140 , clique :: Clique
141 , exportLabel :: [PhyloLabel]
142 , exportSort :: Sort
143 , exportFilter :: [Filter]
144 } deriving (Show,Generic,Eq)
145
146
147 defaultConfig :: Config
148 defaultConfig =
149 Config { corpusPath = ""
150 , listPath = ""
151 , outputPath = ""
152 , corpusParser = Csv 1000
153 , phyloName = pack "Default Phylo"
154 , phyloLevel = 2
155 , phyloProximity = WeightedLogJaccard 10
156 , seaElevation = Constante 0.1 0.1
157 , findAncestors = True
158 , phyloSynchrony = ByProximityThreshold 0.2 10 SiblingBranches MergeAllGroups
159 , phyloQuality = Quality 0 1
160 , timeUnit = Year 3 1 5
161 , clique = MaxClique 0
162 , exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
163 , exportSort = ByHierarchy
164 , exportFilter = [ByBranchSize 2]
165 }
166
167 instance FromJSON Config
168 instance ToJSON Config
169 instance FromJSON CorpusParser
170 instance ToJSON CorpusParser
171 instance FromJSON Proximity
172 instance ToJSON Proximity
173 instance FromJSON SeaElevation
174 instance ToJSON SeaElevation
175 instance FromJSON TimeUnit
176 instance ToJSON TimeUnit
177 instance FromJSON Clique
178 instance ToJSON Clique
179 instance FromJSON PhyloLabel
180 instance ToJSON PhyloLabel
181 instance FromJSON Tagger
182 instance ToJSON Tagger
183 instance FromJSON Sort
184 instance ToJSON Sort
185 instance FromJSON Order
186 instance ToJSON Order
187 instance FromJSON Filter
188 instance ToJSON Filter
189 instance FromJSON SynchronyScope
190 instance ToJSON SynchronyScope
191 instance FromJSON SynchronyStrategy
192 instance ToJSON SynchronyStrategy
193 instance FromJSON Synchrony
194 instance ToJSON Synchrony
195 instance FromJSON Quality
196 instance ToJSON Quality
197
198
199 -- | Software parameters
200 data Software =
201 Software { _software_name :: Text
202 , _software_version :: Text
203 } deriving (Generic, Show, Eq)
204
205 defaultSoftware :: Software
206 defaultSoftware =
207 Software { _software_name = pack "Gargantext"
208 , _software_version = pack "v4" }
209
210
211 -- | Global parameters of a Phylo
212 data PhyloParam =
213 PhyloParam { _phyloParam_version :: Text
214 , _phyloParam_software :: Software
215 , _phyloParam_config :: Config
216 } deriving (Generic, Show, Eq)
217
218 defaultPhyloParam :: PhyloParam
219 defaultPhyloParam =
220 PhyloParam { _phyloParam_version = pack "v2.adaptative"
221 , _phyloParam_software = defaultSoftware
222 , _phyloParam_config = defaultConfig }
223
224
225 ------------------
226 -- | Document | --
227 ------------------
228
229
230 -- | Date : a simple Integer
231 type Date = Int
232
233 -- | Ngrams : a contiguous sequence of n terms
234 type Ngrams = Text
235
236 -- | Document : a piece of Text linked to a Date
237 data Document = Document
238 { date :: Date
239 , text :: [Ngrams]
240 } deriving (Eq,Show,Generic,NFData)
241
242
243 --------------------
244 -- | Foundation | --
245 --------------------
246
247
248 -- | The Foundations of a Phylo created from a given TermList
249 data PhyloFoundations = PhyloFoundations
250 { _foundations_roots :: !(Vector Ngrams)
251 , _foundations_mapList :: TermList
252 } deriving (Generic, Show, Eq)
253
254
255 ---------------------------
256 -- | Coocurency Matrix | --
257 ---------------------------
258
259
260 -- | Cooc : a coocurency matrix between two ngrams
261 type Cooc = Map (Int,Int) Double
262
263
264 -------------------
265 -- | Phylomemy | --
266 -------------------
267
268
269 -- | Phylo datatype of a phylomemy
270 -- foundations : the foundations of the phylo
271 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
272 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
273 -- param : the parameters of the phylomemy (with the user's configuration)
274 -- periods : the temporal steps of a phylomemy
275 data Phylo =
276 Phylo { _phylo_foundations :: PhyloFoundations
277 , _phylo_timeCooc :: !(Map Date Cooc)
278 , _phylo_timeDocs :: !(Map Date Double)
279 , _phylo_termFreq :: !(Map Int Double)
280 , _phylo_lastTermFreq :: !(Map Int Double)
281 , _phylo_horizon :: !(Map (PhyloGroupId,PhyloGroupId) Double)
282 , _phylo_groupsProxi :: !(Map (PhyloGroupId,PhyloGroupId) Double)
283 , _phylo_param :: PhyloParam
284 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
285 }
286 deriving (Generic, Show, Eq)
287
288
289 -- | PhyloPeriodId : the id of a given period
290 type PhyloPeriodId = (Date,Date)
291
292 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
293 -- id: tuple (start date, end date) of the temporal step of the phylomemy
294 -- levels: levels of granularity
295 data PhyloPeriod =
296 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
297 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
298 } deriving (Generic, Show, Eq)
299
300
301 -- | Level : a level of clustering
302 type Level = Int
303
304 -- | PhyloLevelId : the id of a level of clustering in a given period
305 type PhyloLevelId = (PhyloPeriodId,Level)
306
307 -- | PhyloLevel : levels of phylomemy on a synchronic axis
308 -- Levels description:
309 -- Level 0: The foundations and the base of the phylo
310 -- Level 1: First level of clustering (the Fis)
311 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
312 data PhyloLevel =
313 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
314 , _phylo_levelLevel :: Level
315 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
316 }
317 deriving (Generic, Show, Eq)
318
319
320 type PhyloGroupId = (PhyloLevelId, Int)
321
322 -- | BranchId : (a level, a sequence of branch index)
323 -- the sequence is a path of heritage from the most to the less specific branch
324 type PhyloBranchId = (Level, [Int])
325
326 -- | PhyloGroup : group of ngrams at each level and period
327 data PhyloGroup =
328 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
329 , _phylo_groupLevel :: Level
330 , _phylo_groupIndex :: Int
331 , _phylo_groupLabel :: Text
332 , _phylo_groupSupport :: Support
333 , _phylo_groupNgrams :: [Int]
334 , _phylo_groupCooc :: !(Cooc)
335 , _phylo_groupBranchId :: PhyloBranchId
336 , _phylo_groupMeta :: Map Text [Double]
337 , _phylo_groupLevelParents :: [Pointer]
338 , _phylo_groupLevelChilds :: [Pointer]
339 , _phylo_groupPeriodParents :: [Pointer]
340 , _phylo_groupPeriodChilds :: [Pointer]
341 , _phylo_groupAncestors :: [Pointer]
342 }
343 deriving (Generic, Show, Eq, NFData)
344
345 -- | Weight : A generic mesure that can be associated with an Id
346 type Weight = Double
347
348 -- | Pointer : A weighted pointer to a given PhyloGroup
349 type Pointer = (PhyloGroupId, Weight)
350
351 data Filiation = ToParents | ToChilds deriving (Generic, Show)
352 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
353
354
355 ----------------------
356 -- | Phylo Clique | --
357 ----------------------
358
359 -- | Support : Number of Documents where a Clique occurs
360 type Support = Int
361
362 data PhyloClique = PhyloClique
363 { _phyloClique_nodes :: [Int]
364 , _phyloClique_support :: Support
365 , _phyloClique_period :: (Date,Date)
366 } deriving (Generic,NFData,Show,Eq)
367
368 ----------------
369 -- | Export | --
370 ----------------
371
372 type DotId = TextLazy.Text
373
374 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
375
376 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
377
378 data Order = Asc | Desc deriving (Show,Generic,Eq)
379
380 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
381
382 data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
383
384 data PhyloLabel =
385 BranchLabel
386 { _branch_labelTagger :: Tagger
387 , _branch_labelSize :: Int }
388 | GroupLabel
389 { _group_labelTagger :: Tagger
390 , _group_labelSize :: Int }
391 deriving (Show,Generic,Eq)
392
393 data PhyloBranch =
394 PhyloBranch
395 { _branch_id :: PhyloBranchId
396 , _branch_canonId :: [Int]
397 , _branch_seaLevel :: [Double]
398 , _branch_x :: Double
399 , _branch_y :: Double
400 , _branch_w :: Double
401 , _branch_t :: Double
402 , _branch_label :: Text
403 , _branch_meta :: Map Text [Double]
404 } deriving (Generic, Show, Eq)
405
406 data PhyloExport =
407 PhyloExport
408 { _export_groups :: [PhyloGroup]
409 , _export_branches :: [PhyloBranch]
410 } deriving (Generic, Show)
411
412 ----------------
413 -- | Lenses | --
414 ----------------
415
416 makeLenses ''Config
417 makeLenses ''Proximity
418 makeLenses ''SeaElevation
419 makeLenses ''Quality
420 makeLenses ''Clique
421 makeLenses ''PhyloLabel
422 makeLenses ''TimeUnit
423 makeLenses ''PhyloFoundations
424 makeLenses ''PhyloClique
425 makeLenses ''Phylo
426 makeLenses ''PhyloPeriod
427 makeLenses ''PhyloLevel
428 makeLenses ''PhyloGroup
429 makeLenses ''PhyloParam
430 makeLenses ''PhyloExport
431 makeLenses ''PhyloBranch
432
433 ------------------------
434 -- | JSON instances | --
435 ------------------------
436
437
438 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)