]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/AdaptativePhylo.hs
add new synchronic clustering
[gargantext.git] / src / Gargantext / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE TemplateHaskell #-}
27 {-# LANGUAGE MultiParamTypeClasses #-}
28
29 module Gargantext.Viz.AdaptativePhylo where
30
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Text (Text, pack)
34 import Data.Vector (Vector)
35 import Data.Map (Map)
36 import Data.Set (Set)
37
38 import Gargantext.Core.Utils.Prefix (unPrefix)
39 import Gargantext.Prelude
40 import Gargantext.Text.Context (TermList)
41
42 import GHC.Generics
43 import GHC.IO (FilePath)
44 import Control.DeepSeq (NFData)
45 import Control.Lens (makeLenses)
46
47 import qualified Data.Text.Lazy as TextLazy
48
49
50 ----------------
51 -- | Config | --
52 ----------------
53
54
55 data CorpusParser =
56 Wos {_wos_limit :: Int}
57 | Csv {_csv_limit :: Int}
58 deriving (Show,Generic,Eq)
59
60
61 data Proximity =
62 WeightedLogJaccard
63 { _wlj_sensibility :: Double
64 , _wlj_thresholdInit :: Double
65 , _wlj_thresholdStep :: Double }
66 | Hamming
67 deriving (Show,Generic,Eq)
68
69
70 data Synchrony =
71 ByProximityThreshold
72 { _bpt_threshold :: Double
73 , _bpt_sensibility :: Double}
74 | ByProximityDistribution
75 { _bpd_sensibility :: Double}
76 deriving (Show,Generic,Eq)
77
78
79 data TimeUnit =
80 Year
81 { _year_period :: Int
82 , _year_step :: Int
83 , _year_matchingFrame :: Int }
84 deriving (Show,Generic,Eq)
85
86
87 data ContextualUnit =
88 Fis
89 { _fis_support :: Int
90 , _fis_size :: Int }
91 deriving (Show,Generic,Eq)
92
93
94 data Config =
95 Config { corpusPath :: FilePath
96 , listPath :: FilePath
97 , outputPath :: FilePath
98 , corpusParser :: CorpusParser
99 , phyloName :: Text
100 , phyloLevel :: Int
101 , phyloProximity :: Proximity
102 , phyloSynchrony :: Synchrony
103 , timeUnit :: TimeUnit
104 , contextualUnit :: ContextualUnit
105 , exportLabel :: [PhyloLabel]
106 , exportSort :: Sort
107 , exportFilter :: [Filter]
108 } deriving (Show,Generic,Eq)
109
110
111 defaultConfig :: Config
112 defaultConfig =
113 Config { corpusPath = ""
114 , listPath = ""
115 , outputPath = ""
116 , corpusParser = Csv 1000
117 , phyloName = pack "Default Phylo"
118 , phyloLevel = 1
119 , phyloProximity = WeightedLogJaccard 10 0 0.1
120 , phyloSynchrony = ByProximityDistribution 0
121 , timeUnit = Year 3 1 5
122 , contextualUnit = Fis 2 4
123 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
124 , exportSort = ByHierarchy
125 , exportFilter = [ByBranchSize 2]
126 }
127
128 instance FromJSON Config
129 instance ToJSON Config
130 instance FromJSON CorpusParser
131 instance ToJSON CorpusParser
132 instance FromJSON Proximity
133 instance ToJSON Proximity
134 instance FromJSON TimeUnit
135 instance ToJSON TimeUnit
136 instance FromJSON ContextualUnit
137 instance ToJSON ContextualUnit
138 instance FromJSON PhyloLabel
139 instance ToJSON PhyloLabel
140 instance FromJSON Tagger
141 instance ToJSON Tagger
142 instance FromJSON Sort
143 instance ToJSON Sort
144 instance FromJSON Order
145 instance ToJSON Order
146 instance FromJSON Filter
147 instance ToJSON Filter
148 instance FromJSON Synchrony
149 instance ToJSON Synchrony
150
151
152 -- | Software parameters
153 data Software =
154 Software { _software_name :: Text
155 , _software_version :: Text
156 } deriving (Generic, Show, Eq)
157
158 defaultSoftware :: Software
159 defaultSoftware =
160 Software { _software_name = pack "Gargantext"
161 , _software_version = pack "v4" }
162
163
164 -- | Global parameters of a Phylo
165 data PhyloParam =
166 PhyloParam { _phyloParam_version :: Text
167 , _phyloParam_software :: Software
168 , _phyloParam_config :: Config
169 } deriving (Generic, Show, Eq)
170
171 defaultPhyloParam :: PhyloParam
172 defaultPhyloParam =
173 PhyloParam { _phyloParam_version = pack "v2.adaptative"
174 , _phyloParam_software = defaultSoftware
175 , _phyloParam_config = defaultConfig }
176
177
178 ------------------
179 -- | Document | --
180 ------------------
181
182
183 -- | Date : a simple Integer
184 type Date = Int
185
186 -- | Ngrams : a contiguous sequence of n terms
187 type Ngrams = Text
188
189 -- | Document : a piece of Text linked to a Date
190 data Document = Document
191 { date :: Date
192 , text :: [Ngrams]
193 } deriving (Eq,Show,Generic,NFData)
194
195
196 --------------------
197 -- | Foundation | --
198 --------------------
199
200
201 -- | The Foundations of a Phylo created from a given TermList
202 data PhyloFoundations = PhyloFoundations
203 { _foundations_roots :: !(Vector Ngrams)
204 , _foundations_mapList :: TermList
205 } deriving (Generic, Show, Eq)
206
207
208 ---------------------------
209 -- | Coocurency Matrix | --
210 ---------------------------
211
212
213 -- | Cooc : a coocurency matrix between two ngrams
214 type Cooc = Map (Int,Int) Double
215
216
217 -------------------
218 -- | Phylomemy | --
219 -------------------
220
221
222 -- | Phylo datatype of a phylomemy
223 -- foundations : the foundations of the phylo
224 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
225 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
226 -- param : the parameters of the phylomemy (with the user's configuration)
227 -- periods : the temporal steps of a phylomemy
228 data Phylo =
229 Phylo { _phylo_foundations :: PhyloFoundations
230 , _phylo_timeCooc :: !(Map Date Cooc)
231 , _phylo_timeDocs :: !(Map Date Double)
232 , _phylo_param :: PhyloParam
233 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
234 }
235 deriving (Generic, Show, Eq)
236
237
238 -- | PhyloPeriodId : the id of a given period
239 type PhyloPeriodId = (Date,Date)
240
241 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
242 -- id: tuple (start date, end date) of the temporal step of the phylomemy
243 -- levels: levels of granularity
244 data PhyloPeriod =
245 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
246 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
247 } deriving (Generic, Show, Eq)
248
249
250 -- | Level : a level of clustering
251 type Level = Int
252
253 -- | PhyloLevelId : the id of a level of clustering in a given period
254 type PhyloLevelId = (PhyloPeriodId,Level)
255
256 -- | PhyloLevel : levels of phylomemy on a synchronic axis
257 -- Levels description:
258 -- Level 0: The foundations and the base of the phylo
259 -- Level 1: First level of clustering (the Fis)
260 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
261 data PhyloLevel =
262 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
263 , _phylo_levelLevel :: Level
264 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
265 }
266 deriving (Generic, Show, Eq)
267
268
269 type PhyloGroupId = (PhyloLevelId, Int)
270
271 -- | BranchId : (a level, a sequence of branch index)
272 -- the sequence is a path of heritage from the most to the less specific branch
273 type PhyloBranchId = (Level, [Int])
274
275 -- | PhyloGroup : group of ngrams at each level and period
276 data PhyloGroup =
277 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
278 , _phylo_groupLevel :: Level
279 , _phylo_groupIndex :: Int
280 , _phylo_groupLabel :: Text
281 , _phylo_groupSupport :: Support
282 , _phylo_groupNgrams :: [Int]
283 , _phylo_groupCooc :: !(Cooc)
284 , _phylo_groupBranchId :: PhyloBranchId
285 , _phylo_groupMeta :: Map Text [Double]
286 , _phylo_groupLevelParents :: [Pointer]
287 , _phylo_groupLevelChilds :: [Pointer]
288 , _phylo_groupPeriodParents :: [Pointer]
289 , _phylo_groupPeriodChilds :: [Pointer]
290 }
291 deriving (Generic, Show, Eq, NFData)
292
293 -- | Weight : A generic mesure that can be associated with an Id
294 type Weight = Double
295
296 -- | Pointer : A weighted pointer to a given PhyloGroup
297 type Pointer = (PhyloGroupId, Weight)
298
299 data Filiation = ToParents | ToChilds deriving (Generic, Show)
300 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
301
302
303 ---------------------------
304 -- | Frequent Item Set | --
305 ---------------------------
306
307 -- | Clique : Set of ngrams cooccurring in the same Document
308 type Clique = Set Ngrams
309
310 -- | Support : Number of Documents where a Clique occurs
311 type Support = Int
312
313 -- | Fis : Frequent Items Set (ie: the association between a Clique and a Support)
314 data PhyloFis = PhyloFis
315 { _phyloFis_clique :: Clique
316 , _phyloFis_support :: Support
317 , _phyloFis_period :: (Date,Date)
318 } deriving (Generic,NFData,Show,Eq)
319
320
321 ----------------
322 -- | Export | --
323 ----------------
324
325 type DotId = TextLazy.Text
326
327 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
328
329 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
330
331 data Order = Asc | Desc deriving (Show,Generic,Eq)
332
333 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
334
335 data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
336
337 data PhyloLabel =
338 BranchLabel
339 { _branch_labelTagger :: Tagger
340 , _branch_labelSize :: Int }
341 | GroupLabel
342 { _group_labelTagger :: Tagger
343 , _group_labelSize :: Int }
344 deriving (Show,Generic,Eq)
345
346 data PhyloBranch =
347 PhyloBranch
348 { _branch_id :: PhyloBranchId
349 , _branch_label :: Text
350 , _branch_meta :: Map Text [Double]
351 } deriving (Generic, Show)
352
353 data PhyloExport =
354 PhyloExport
355 { _export_groups :: [PhyloGroup]
356 , _export_branches :: [PhyloBranch]
357 } deriving (Generic, Show)
358
359 ----------------
360 -- | Lenses | --
361 ----------------
362
363 makeLenses ''Config
364 makeLenses ''Proximity
365 makeLenses ''ContextualUnit
366 makeLenses ''PhyloLabel
367 makeLenses ''TimeUnit
368 makeLenses ''PhyloFoundations
369 makeLenses ''PhyloFis
370 makeLenses ''Phylo
371 makeLenses ''PhyloPeriod
372 makeLenses ''PhyloLevel
373 makeLenses ''PhyloGroup
374 makeLenses ''PhyloParam
375 makeLenses ''PhyloExport
376 makeLenses ''PhyloBranch
377
378 ------------------------
379 -- | JSON instances | --
380 ------------------------
381
382
383 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)