]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/AdaptativePhylo.hs
end of phylo clustering
[gargantext.git] / src / Gargantext / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE TemplateHaskell #-}
27 {-# LANGUAGE MultiParamTypeClasses #-}
28
29 module Gargantext.Viz.AdaptativePhylo where
30
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Text (Text, pack)
34 import Data.Vector (Vector)
35 import Data.Map (Map)
36 import Data.Set (Set)
37
38 import Gargantext.Core.Utils.Prefix (unPrefix)
39 import Gargantext.Prelude
40 import Gargantext.Text.Context (TermList)
41
42 import GHC.Generics
43 import GHC.IO (FilePath)
44 import Control.DeepSeq (NFData)
45 import Control.Lens (makeLenses)
46
47 import qualified Data.Text.Lazy as TextLazy
48
49
50 ----------------
51 -- | Config | --
52 ----------------
53
54
55 data CorpusParser =
56 Wos {_wos_limit :: Int}
57 | Csv {_csv_limit :: Int}
58 deriving (Show,Generic,Eq)
59
60
61 data Proximity =
62 WeightedLogJaccard
63 { _wlj_sensibility :: Double
64 , _wlj_thresholdInit :: Double
65 , _wlj_thresholdStep :: Double }
66 | Hamming
67 deriving (Show,Generic,Eq)
68
69
70 data Synchrony =
71 ByProximityThreshold
72 { _bpt_threshold :: Double }
73 | ByProximityDistribution
74 deriving (Show,Generic,Eq)
75
76
77 data TimeUnit =
78 Year
79 { _year_period :: Int
80 , _year_step :: Int
81 , _year_matchingFrame :: Int }
82 deriving (Show,Generic,Eq)
83
84
85 data ContextualUnit =
86 Fis
87 { _fis_support :: Int
88 , _fis_size :: Int }
89 deriving (Show,Generic,Eq)
90
91
92 data Config =
93 Config { corpusPath :: FilePath
94 , listPath :: FilePath
95 , outputPath :: FilePath
96 , corpusParser :: CorpusParser
97 , phyloName :: Text
98 , phyloLevel :: Int
99 , phyloProximity :: Proximity
100 , phyloSynchrony :: Synchrony
101 , timeUnit :: TimeUnit
102 , contextualUnit :: ContextualUnit
103 , exportLabel :: [PhyloLabel]
104 , exportSort :: Sort
105 , exportFilter :: [Filter]
106 } deriving (Show,Generic,Eq)
107
108
109 defaultConfig :: Config
110 defaultConfig =
111 Config { corpusPath = ""
112 , listPath = ""
113 , outputPath = ""
114 , corpusParser = Csv 1000
115 , phyloName = pack "Default Phylo"
116 , phyloLevel = 1
117 , phyloProximity = WeightedLogJaccard 10 0 0.1
118 , phyloSynchrony = ByProximityThreshold 0.1
119 , timeUnit = Year 3 1 5
120 , contextualUnit = Fis 2 4
121 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
122 , exportSort = ByHierarchy
123 , exportFilter = [ByBranchSize 2]
124 }
125
126 instance FromJSON Config
127 instance ToJSON Config
128 instance FromJSON CorpusParser
129 instance ToJSON CorpusParser
130 instance FromJSON Proximity
131 instance ToJSON Proximity
132 instance FromJSON TimeUnit
133 instance ToJSON TimeUnit
134 instance FromJSON ContextualUnit
135 instance ToJSON ContextualUnit
136 instance FromJSON PhyloLabel
137 instance ToJSON PhyloLabel
138 instance FromJSON Tagger
139 instance ToJSON Tagger
140 instance FromJSON Sort
141 instance ToJSON Sort
142 instance FromJSON Order
143 instance ToJSON Order
144 instance FromJSON Filter
145 instance ToJSON Filter
146 instance FromJSON Synchrony
147 instance ToJSON Synchrony
148
149
150 -- | Software parameters
151 data Software =
152 Software { _software_name :: Text
153 , _software_version :: Text
154 } deriving (Generic, Show, Eq)
155
156 defaultSoftware :: Software
157 defaultSoftware =
158 Software { _software_name = pack "Gargantext"
159 , _software_version = pack "v4" }
160
161
162 -- | Global parameters of a Phylo
163 data PhyloParam =
164 PhyloParam { _phyloParam_version :: Text
165 , _phyloParam_software :: Software
166 , _phyloParam_config :: Config
167 } deriving (Generic, Show, Eq)
168
169 defaultPhyloParam :: PhyloParam
170 defaultPhyloParam =
171 PhyloParam { _phyloParam_version = pack "v2.adaptative"
172 , _phyloParam_software = defaultSoftware
173 , _phyloParam_config = defaultConfig }
174
175
176 ------------------
177 -- | Document | --
178 ------------------
179
180
181 -- | Date : a simple Integer
182 type Date = Int
183
184 -- | Ngrams : a contiguous sequence of n terms
185 type Ngrams = Text
186
187 -- | Document : a piece of Text linked to a Date
188 data Document = Document
189 { date :: Date
190 , text :: [Ngrams]
191 } deriving (Eq,Show,Generic,NFData)
192
193
194 --------------------
195 -- | Foundation | --
196 --------------------
197
198
199 -- | The Foundations of a Phylo created from a given TermList
200 data PhyloFoundations = PhyloFoundations
201 { _foundations_roots :: !(Vector Ngrams)
202 , _foundations_mapList :: TermList
203 } deriving (Generic, Show, Eq)
204
205
206 ---------------------------
207 -- | Coocurency Matrix | --
208 ---------------------------
209
210
211 -- | Cooc : a coocurency matrix between two ngrams
212 type Cooc = Map (Int,Int) Double
213
214
215 -------------------
216 -- | Phylomemy | --
217 -------------------
218
219
220 -- | Phylo datatype of a phylomemy
221 -- foundations : the foundations of the phylo
222 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
223 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
224 -- param : the parameters of the phylomemy (with the user's configuration)
225 -- periods : the temporal steps of a phylomemy
226 data Phylo =
227 Phylo { _phylo_foundations :: PhyloFoundations
228 , _phylo_timeCooc :: !(Map Date Cooc)
229 , _phylo_timeDocs :: !(Map Date Double)
230 , _phylo_param :: PhyloParam
231 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
232 }
233 deriving (Generic, Show, Eq)
234
235
236 -- | PhyloPeriodId : the id of a given period
237 type PhyloPeriodId = (Date,Date)
238
239 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
240 -- id: tuple (start date, end date) of the temporal step of the phylomemy
241 -- levels: levels of granularity
242 data PhyloPeriod =
243 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
244 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
245 } deriving (Generic, Show, Eq)
246
247
248 -- | Level : a level of clustering
249 type Level = Int
250
251 -- | PhyloLevelId : the id of a level of clustering in a given period
252 type PhyloLevelId = (PhyloPeriodId,Level)
253
254 -- | PhyloLevel : levels of phylomemy on a synchronic axis
255 -- Levels description:
256 -- Level 0: The foundations and the base of the phylo
257 -- Level 1: First level of clustering (the Fis)
258 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
259 data PhyloLevel =
260 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
261 , _phylo_levelLevel :: Level
262 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
263 }
264 deriving (Generic, Show, Eq)
265
266
267 type PhyloGroupId = (PhyloLevelId, Int)
268
269 -- | BranchId : (a level, a sequence of branch index)
270 -- the sequence is a path of heritage from the most to the less specific branch
271 type PhyloBranchId = (Level, [Int])
272
273 -- | PhyloGroup : group of ngrams at each level and period
274 data PhyloGroup =
275 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
276 , _phylo_groupLevel :: Level
277 , _phylo_groupIndex :: Int
278 , _phylo_groupLabel :: Text
279 , _phylo_groupSupport :: Support
280 , _phylo_groupNgrams :: [Int]
281 , _phylo_groupCooc :: !(Cooc)
282 , _phylo_groupBranchId :: PhyloBranchId
283 , _phylo_groupMeta :: Map Text [Double]
284 , _phylo_groupLevelParents :: [Pointer]
285 , _phylo_groupLevelChilds :: [Pointer]
286 , _phylo_groupPeriodParents :: [Pointer]
287 , _phylo_groupPeriodChilds :: [Pointer]
288 }
289 deriving (Generic, Show, Eq)
290
291 -- | Weight : A generic mesure that can be associated with an Id
292 type Weight = Double
293
294 -- | Pointer : A weighted pointer to a given PhyloGroup
295 type Pointer = (PhyloGroupId, Weight)
296
297 data Filiation = ToParents | ToChilds deriving (Generic, Show)
298 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
299
300
301 ---------------------------
302 -- | Frequent Item Set | --
303 ---------------------------
304
305 -- | Clique : Set of ngrams cooccurring in the same Document
306 type Clique = Set Ngrams
307
308 -- | Support : Number of Documents where a Clique occurs
309 type Support = Int
310
311 -- | Fis : Frequent Items Set (ie: the association between a Clique and a Support)
312 data PhyloFis = PhyloFis
313 { _phyloFis_clique :: Clique
314 , _phyloFis_support :: Support
315 , _phyloFis_period :: (Date,Date)
316 } deriving (Generic,NFData,Show,Eq)
317
318
319 ----------------
320 -- | Export | --
321 ----------------
322
323 type DotId = TextLazy.Text
324
325 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
326
327 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
328
329 data Order = Asc | Desc deriving (Show,Generic,Eq)
330
331 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
332
333 data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
334
335 data PhyloLabel =
336 BranchLabel
337 { _branch_labelTagger :: Tagger
338 , _branch_labelSize :: Int }
339 | GroupLabel
340 { _group_labelTagger :: Tagger
341 , _group_labelSize :: Int }
342 deriving (Show,Generic,Eq)
343
344 data PhyloBranch =
345 PhyloBranch
346 { _branch_id :: PhyloBranchId
347 , _branch_label :: Text
348 , _branch_meta :: Map Text [Double]
349 } deriving (Generic, Show)
350
351 data PhyloExport =
352 PhyloExport
353 { _export_groups :: [PhyloGroup]
354 , _export_branches :: [PhyloBranch]
355 } deriving (Generic, Show)
356
357 ----------------
358 -- | Lenses | --
359 ----------------
360
361 makeLenses ''Config
362 makeLenses ''Proximity
363 makeLenses ''ContextualUnit
364 makeLenses ''PhyloLabel
365 makeLenses ''TimeUnit
366 makeLenses ''PhyloFoundations
367 makeLenses ''PhyloFis
368 makeLenses ''Phylo
369 makeLenses ''PhyloPeriod
370 makeLenses ''PhyloLevel
371 makeLenses ''PhyloGroup
372 makeLenses ''PhyloParam
373 makeLenses ''PhyloExport
374 makeLenses ''PhyloBranch
375
376 ------------------------
377 -- | JSON instances | --
378 ------------------------
379
380
381 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)