]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/AdaptativePhylo.hs
boxshape
[gargantext.git] / src / Gargantext / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE TemplateHaskell #-}
27 {-# LANGUAGE MultiParamTypeClasses #-}
28
29 module Gargantext.Viz.AdaptativePhylo where
30
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Text (Text, pack)
34 import Data.Vector (Vector)
35 import Data.Map (Map)
36 import Data.Set (Set)
37
38 import Gargantext.Core.Utils.Prefix (unPrefix)
39 import Gargantext.Prelude
40 import Gargantext.Text.Context (TermList)
41
42 import GHC.Generics
43 import GHC.IO (FilePath)
44 import Control.DeepSeq (NFData)
45 import Control.Lens (makeLenses)
46
47 import qualified Data.Text.Lazy as TextLazy
48
49
50 ----------------
51 -- | Config | --
52 ----------------
53
54
55 data CorpusParser =
56 Wos {_wos_limit :: Int}
57 | Csv {_csv_limit :: Int}
58 deriving (Show,Generic,Eq)
59
60
61 data Proximity =
62 WeightedLogJaccard
63 { _wlj_sensibility :: Double
64 , _wlj_thresholdInit :: Double
65 , _wlj_thresholdStep :: Double }
66 | Hamming
67 deriving (Show,Generic,Eq)
68
69
70 data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
71
72 data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
73
74 data Synchrony =
75 ByProximityThreshold
76 { _bpt_threshold :: Double
77 , _bpt_sensibility :: Double
78 , _bpt_scope :: SynchronyScope
79 , _bpt_strategy :: SynchronyStrategy }
80 | ByProximityDistribution
81 { _bpd_sensibility :: Double
82 , _bpd_strategy :: SynchronyStrategy }
83 deriving (Show,Generic,Eq)
84
85
86 data TimeUnit =
87 Year
88 { _year_period :: Int
89 , _year_step :: Int
90 , _year_matchingFrame :: Int }
91 deriving (Show,Generic,Eq)
92
93
94 data Clique =
95 Fis
96 { _fis_support :: Int
97 , _fis_size :: Int }
98 | MaxClique
99 { _mcl_size :: Int }
100 deriving (Show,Generic,Eq)
101
102
103 data Quality =
104 Quality { _qua_granularity :: Double
105 , _qua_minBranch :: Int }
106 deriving (Show,Generic,Eq)
107
108
109 data Config =
110 Config { corpusPath :: FilePath
111 , listPath :: FilePath
112 , outputPath :: FilePath
113 , corpusParser :: CorpusParser
114 , phyloName :: Text
115 , phyloLevel :: Int
116 , phyloProximity :: Proximity
117 , phyloSynchrony :: Synchrony
118 , phyloQuality :: Quality
119 , timeUnit :: TimeUnit
120 , clique :: Clique
121 , exportLabel :: [PhyloLabel]
122 , exportSort :: Sort
123 , exportFilter :: [Filter]
124 } deriving (Show,Generic,Eq)
125
126
127 defaultConfig :: Config
128 defaultConfig =
129 Config { corpusPath = ""
130 , listPath = ""
131 , outputPath = ""
132 , corpusParser = Csv 1000
133 , phyloName = pack "Default Phylo"
134 , phyloLevel = 2
135 , phyloProximity = WeightedLogJaccard 10 0 0.1
136 , phyloSynchrony = ByProximityThreshold 0.1 10 AllBranches MergeAllGroups
137 , phyloQuality = Quality 0.1 1
138 , timeUnit = Year 3 1 5
139 , clique = Fis 1 5
140 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
141 , exportSort = ByHierarchy
142 , exportFilter = [ByBranchSize 2]
143 }
144
145 instance FromJSON Config
146 instance ToJSON Config
147 instance FromJSON CorpusParser
148 instance ToJSON CorpusParser
149 instance FromJSON Proximity
150 instance ToJSON Proximity
151 instance FromJSON TimeUnit
152 instance ToJSON TimeUnit
153 instance FromJSON Clique
154 instance ToJSON Clique
155 instance FromJSON PhyloLabel
156 instance ToJSON PhyloLabel
157 instance FromJSON Tagger
158 instance ToJSON Tagger
159 instance FromJSON Sort
160 instance ToJSON Sort
161 instance FromJSON Order
162 instance ToJSON Order
163 instance FromJSON Filter
164 instance ToJSON Filter
165 instance FromJSON SynchronyScope
166 instance ToJSON SynchronyScope
167 instance FromJSON SynchronyStrategy
168 instance ToJSON SynchronyStrategy
169 instance FromJSON Synchrony
170 instance ToJSON Synchrony
171 instance FromJSON Quality
172 instance ToJSON Quality
173
174
175 -- | Software parameters
176 data Software =
177 Software { _software_name :: Text
178 , _software_version :: Text
179 } deriving (Generic, Show, Eq)
180
181 defaultSoftware :: Software
182 defaultSoftware =
183 Software { _software_name = pack "Gargantext"
184 , _software_version = pack "v4" }
185
186
187 -- | Global parameters of a Phylo
188 data PhyloParam =
189 PhyloParam { _phyloParam_version :: Text
190 , _phyloParam_software :: Software
191 , _phyloParam_config :: Config
192 } deriving (Generic, Show, Eq)
193
194 defaultPhyloParam :: PhyloParam
195 defaultPhyloParam =
196 PhyloParam { _phyloParam_version = pack "v2.adaptative"
197 , _phyloParam_software = defaultSoftware
198 , _phyloParam_config = defaultConfig }
199
200
201 ------------------
202 -- | Document | --
203 ------------------
204
205
206 -- | Date : a simple Integer
207 type Date = Int
208
209 -- | Ngrams : a contiguous sequence of n terms
210 type Ngrams = Text
211
212 -- | Document : a piece of Text linked to a Date
213 data Document = Document
214 { date :: Date
215 , text :: [Ngrams]
216 } deriving (Eq,Show,Generic,NFData)
217
218
219 --------------------
220 -- | Foundation | --
221 --------------------
222
223
224 -- | The Foundations of a Phylo created from a given TermList
225 data PhyloFoundations = PhyloFoundations
226 { _foundations_roots :: !(Vector Ngrams)
227 , _foundations_mapList :: TermList
228 } deriving (Generic, Show, Eq)
229
230
231 ---------------------------
232 -- | Coocurency Matrix | --
233 ---------------------------
234
235
236 -- | Cooc : a coocurency matrix between two ngrams
237 type Cooc = Map (Int,Int) Double
238
239
240 -------------------
241 -- | Phylomemy | --
242 -------------------
243
244
245 -- | Phylo datatype of a phylomemy
246 -- foundations : the foundations of the phylo
247 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
248 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
249 -- param : the parameters of the phylomemy (with the user's configuration)
250 -- periods : the temporal steps of a phylomemy
251 data Phylo =
252 Phylo { _phylo_foundations :: PhyloFoundations
253 , _phylo_timeCooc :: !(Map Date Cooc)
254 , _phylo_timeDocs :: !(Map Date Double)
255 , _phylo_termFreq :: !(Map Int Double)
256 , _phylo_param :: PhyloParam
257 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
258 }
259 deriving (Generic, Show, Eq)
260
261
262 -- | PhyloPeriodId : the id of a given period
263 type PhyloPeriodId = (Date,Date)
264
265 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
266 -- id: tuple (start date, end date) of the temporal step of the phylomemy
267 -- levels: levels of granularity
268 data PhyloPeriod =
269 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
270 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
271 } deriving (Generic, Show, Eq)
272
273
274 -- | Level : a level of clustering
275 type Level = Int
276
277 -- | PhyloLevelId : the id of a level of clustering in a given period
278 type PhyloLevelId = (PhyloPeriodId,Level)
279
280 -- | PhyloLevel : levels of phylomemy on a synchronic axis
281 -- Levels description:
282 -- Level 0: The foundations and the base of the phylo
283 -- Level 1: First level of clustering (the Fis)
284 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
285 data PhyloLevel =
286 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
287 , _phylo_levelLevel :: Level
288 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
289 }
290 deriving (Generic, Show, Eq)
291
292
293 type PhyloGroupId = (PhyloLevelId, Int)
294
295 -- | BranchId : (a level, a sequence of branch index)
296 -- the sequence is a path of heritage from the most to the less specific branch
297 type PhyloBranchId = (Level, [Int])
298
299 -- | PhyloGroup : group of ngrams at each level and period
300 data PhyloGroup =
301 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
302 , _phylo_groupLevel :: Level
303 , _phylo_groupIndex :: Int
304 , _phylo_groupLabel :: Text
305 , _phylo_groupSupport :: Support
306 , _phylo_groupNgrams :: [Int]
307 , _phylo_groupCooc :: !(Cooc)
308 , _phylo_groupBranchId :: PhyloBranchId
309 , _phylo_groupMeta :: Map Text [Double]
310 , _phylo_groupLevelParents :: [Pointer]
311 , _phylo_groupLevelChilds :: [Pointer]
312 , _phylo_groupPeriodParents :: [Pointer]
313 , _phylo_groupPeriodChilds :: [Pointer]
314 }
315 deriving (Generic, Show, Eq, NFData)
316
317 -- | Weight : A generic mesure that can be associated with an Id
318 type Weight = Double
319
320 -- | Pointer : A weighted pointer to a given PhyloGroup
321 type Pointer = (PhyloGroupId, Weight)
322
323 data Filiation = ToParents | ToChilds deriving (Generic, Show)
324 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
325
326
327 ----------------------
328 -- | Phylo Clique | --
329 ----------------------
330
331 -- | Support : Number of Documents where a Clique occurs
332 type Support = Int
333
334 data PhyloClique = PhyloClique
335 { _phyloClique_nodes :: Set Ngrams
336 , _phyloClique_support :: Support
337 , _phyloClique_period :: (Date,Date)
338 } deriving (Generic,NFData,Show,Eq)
339
340
341 ----------------
342 -- | Export | --
343 ----------------
344
345 type DotId = TextLazy.Text
346
347 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
348
349 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
350
351 data Order = Asc | Desc deriving (Show,Generic,Eq)
352
353 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
354
355 data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
356
357 data PhyloLabel =
358 BranchLabel
359 { _branch_labelTagger :: Tagger
360 , _branch_labelSize :: Int }
361 | GroupLabel
362 { _group_labelTagger :: Tagger
363 , _group_labelSize :: Int }
364 deriving (Show,Generic,Eq)
365
366 data PhyloBranch =
367 PhyloBranch
368 { _branch_id :: PhyloBranchId
369 , _branch_label :: Text
370 , _branch_meta :: Map Text [Double]
371 } deriving (Generic, Show)
372
373 data PhyloExport =
374 PhyloExport
375 { _export_groups :: [PhyloGroup]
376 , _export_branches :: [PhyloBranch]
377 } deriving (Generic, Show)
378
379 ----------------
380 -- | Lenses | --
381 ----------------
382
383 makeLenses ''Config
384 makeLenses ''Proximity
385 makeLenses ''Quality
386 makeLenses ''Clique
387 makeLenses ''PhyloLabel
388 makeLenses ''TimeUnit
389 makeLenses ''PhyloFoundations
390 makeLenses ''PhyloClique
391 makeLenses ''Phylo
392 makeLenses ''PhyloPeriod
393 makeLenses ''PhyloLevel
394 makeLenses ''PhyloGroup
395 makeLenses ''PhyloParam
396 makeLenses ''PhyloExport
397 makeLenses ''PhyloBranch
398
399 ------------------------
400 -- | JSON instances | --
401 ------------------------
402
403
404 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)