]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/AdaptativePhylo.hs
fix the freq
[gargantext.git] / src / Gargantext / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE TemplateHaskell #-}
27 {-# LANGUAGE MultiParamTypeClasses #-}
28
29 module Gargantext.Viz.AdaptativePhylo where
30
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Text (Text, pack)
34 import Data.Vector (Vector)
35 import Data.Map (Map)
36 import Data.Set (Set)
37
38 import Gargantext.Core.Utils.Prefix (unPrefix)
39 import Gargantext.Prelude
40 import Gargantext.Text.Context (TermList)
41
42 import GHC.Generics
43 import GHC.IO (FilePath)
44 import Control.DeepSeq (NFData)
45 import Control.Lens (makeLenses)
46
47 import qualified Data.Text.Lazy as TextLazy
48
49
50 ----------------
51 -- | Config | --
52 ----------------
53
54
55 data CorpusParser =
56 Wos {_wos_limit :: Int}
57 | Csv {_csv_limit :: Int}
58 deriving (Show,Generic,Eq)
59
60
61 data Proximity =
62 WeightedLogJaccard
63 { _wlj_sensibility :: Double
64 , _wlj_thresholdInit :: Double
65 , _wlj_thresholdStep :: Double }
66 | Hamming
67 deriving (Show,Generic,Eq)
68
69
70 data Synchrony =
71 ByProximityThreshold
72 { _bpt_threshold :: Double
73 , _bpt_sensibility :: Double}
74 | ByProximityDistribution
75 { _bpd_sensibility :: Double}
76 deriving (Show,Generic,Eq)
77
78
79 data TimeUnit =
80 Year
81 { _year_period :: Int
82 , _year_step :: Int
83 , _year_matchingFrame :: Int }
84 deriving (Show,Generic,Eq)
85
86
87 data ContextualUnit =
88 Fis
89 { _fis_support :: Int
90 , _fis_size :: Int }
91 deriving (Show,Generic,Eq)
92
93
94 data Quality =
95 Quality { _qua_relevance :: Double
96 , _qua_minBranch :: Int }
97 deriving (Show,Generic,Eq)
98
99
100 data Config =
101 Config { corpusPath :: FilePath
102 , listPath :: FilePath
103 , outputPath :: FilePath
104 , corpusParser :: CorpusParser
105 , phyloName :: Text
106 , phyloLevel :: Int
107 , phyloProximity :: Proximity
108 , phyloSynchrony :: Synchrony
109 , phyloQuality :: Quality
110 , timeUnit :: TimeUnit
111 , contextualUnit :: ContextualUnit
112 , exportLabel :: [PhyloLabel]
113 , exportSort :: Sort
114 , exportFilter :: [Filter]
115 } deriving (Show,Generic,Eq)
116
117
118 defaultConfig :: Config
119 defaultConfig =
120 Config { corpusPath = ""
121 , listPath = ""
122 , outputPath = ""
123 , corpusParser = Csv 1000
124 , phyloName = pack "Default Phylo"
125 , phyloLevel = 2
126 , phyloProximity = WeightedLogJaccard 10 0 0.1
127 , phyloSynchrony = ByProximityDistribution 0
128 , phyloQuality = Quality 10 3
129 , timeUnit = Year 3 1 5
130 , contextualUnit = Fis 1 5
131 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
132 , exportSort = ByHierarchy
133 , exportFilter = [ByBranchSize 2]
134 }
135
136 instance FromJSON Config
137 instance ToJSON Config
138 instance FromJSON CorpusParser
139 instance ToJSON CorpusParser
140 instance FromJSON Proximity
141 instance ToJSON Proximity
142 instance FromJSON TimeUnit
143 instance ToJSON TimeUnit
144 instance FromJSON ContextualUnit
145 instance ToJSON ContextualUnit
146 instance FromJSON PhyloLabel
147 instance ToJSON PhyloLabel
148 instance FromJSON Tagger
149 instance ToJSON Tagger
150 instance FromJSON Sort
151 instance ToJSON Sort
152 instance FromJSON Order
153 instance ToJSON Order
154 instance FromJSON Filter
155 instance ToJSON Filter
156 instance FromJSON Synchrony
157 instance ToJSON Synchrony
158 instance FromJSON Quality
159 instance ToJSON Quality
160
161
162 -- | Software parameters
163 data Software =
164 Software { _software_name :: Text
165 , _software_version :: Text
166 } deriving (Generic, Show, Eq)
167
168 defaultSoftware :: Software
169 defaultSoftware =
170 Software { _software_name = pack "Gargantext"
171 , _software_version = pack "v4" }
172
173
174 -- | Global parameters of a Phylo
175 data PhyloParam =
176 PhyloParam { _phyloParam_version :: Text
177 , _phyloParam_software :: Software
178 , _phyloParam_config :: Config
179 } deriving (Generic, Show, Eq)
180
181 defaultPhyloParam :: PhyloParam
182 defaultPhyloParam =
183 PhyloParam { _phyloParam_version = pack "v2.adaptative"
184 , _phyloParam_software = defaultSoftware
185 , _phyloParam_config = defaultConfig }
186
187
188 ------------------
189 -- | Document | --
190 ------------------
191
192
193 -- | Date : a simple Integer
194 type Date = Int
195
196 -- | Ngrams : a contiguous sequence of n terms
197 type Ngrams = Text
198
199 -- | Document : a piece of Text linked to a Date
200 data Document = Document
201 { date :: Date
202 , text :: [Ngrams]
203 } deriving (Eq,Show,Generic,NFData)
204
205
206 --------------------
207 -- | Foundation | --
208 --------------------
209
210
211 -- | The Foundations of a Phylo created from a given TermList
212 data PhyloFoundations = PhyloFoundations
213 { _foundations_roots :: !(Vector Ngrams)
214 , _foundations_mapList :: TermList
215 } deriving (Generic, Show, Eq)
216
217
218 ---------------------------
219 -- | Coocurency Matrix | --
220 ---------------------------
221
222
223 -- | Cooc : a coocurency matrix between two ngrams
224 type Cooc = Map (Int,Int) Double
225
226
227 -------------------
228 -- | Phylomemy | --
229 -------------------
230
231
232 -- | Phylo datatype of a phylomemy
233 -- foundations : the foundations of the phylo
234 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
235 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
236 -- param : the parameters of the phylomemy (with the user's configuration)
237 -- periods : the temporal steps of a phylomemy
238 data Phylo =
239 Phylo { _phylo_foundations :: PhyloFoundations
240 , _phylo_timeCooc :: !(Map Date Cooc)
241 , _phylo_timeDocs :: !(Map Date Double)
242 , _phylo_termFreq :: !(Map Int Double)
243 , _phylo_param :: PhyloParam
244 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
245 }
246 deriving (Generic, Show, Eq)
247
248
249 -- | PhyloPeriodId : the id of a given period
250 type PhyloPeriodId = (Date,Date)
251
252 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
253 -- id: tuple (start date, end date) of the temporal step of the phylomemy
254 -- levels: levels of granularity
255 data PhyloPeriod =
256 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
257 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
258 } deriving (Generic, Show, Eq)
259
260
261 -- | Level : a level of clustering
262 type Level = Int
263
264 -- | PhyloLevelId : the id of a level of clustering in a given period
265 type PhyloLevelId = (PhyloPeriodId,Level)
266
267 -- | PhyloLevel : levels of phylomemy on a synchronic axis
268 -- Levels description:
269 -- Level 0: The foundations and the base of the phylo
270 -- Level 1: First level of clustering (the Fis)
271 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
272 data PhyloLevel =
273 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
274 , _phylo_levelLevel :: Level
275 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
276 }
277 deriving (Generic, Show, Eq)
278
279
280 type PhyloGroupId = (PhyloLevelId, Int)
281
282 -- | BranchId : (a level, a sequence of branch index)
283 -- the sequence is a path of heritage from the most to the less specific branch
284 type PhyloBranchId = (Level, [Int])
285
286 -- | PhyloGroup : group of ngrams at each level and period
287 data PhyloGroup =
288 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
289 , _phylo_groupLevel :: Level
290 , _phylo_groupIndex :: Int
291 , _phylo_groupLabel :: Text
292 , _phylo_groupSupport :: Support
293 , _phylo_groupNgrams :: [Int]
294 , _phylo_groupCooc :: !(Cooc)
295 , _phylo_groupBranchId :: PhyloBranchId
296 , _phylo_groupMeta :: Map Text [Double]
297 , _phylo_groupLevelParents :: [Pointer]
298 , _phylo_groupLevelChilds :: [Pointer]
299 , _phylo_groupPeriodParents :: [Pointer]
300 , _phylo_groupPeriodChilds :: [Pointer]
301 }
302 deriving (Generic, Show, Eq, NFData)
303
304 -- | Weight : A generic mesure that can be associated with an Id
305 type Weight = Double
306
307 -- | Pointer : A weighted pointer to a given PhyloGroup
308 type Pointer = (PhyloGroupId, Weight)
309
310 data Filiation = ToParents | ToChilds deriving (Generic, Show)
311 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
312
313
314 ---------------------------
315 -- | Frequent Item Set | --
316 ---------------------------
317
318 -- | Clique : Set of ngrams cooccurring in the same Document
319 type Clique = Set Ngrams
320
321 -- | Support : Number of Documents where a Clique occurs
322 type Support = Int
323
324 -- | Fis : Frequent Items Set (ie: the association between a Clique and a Support)
325 data PhyloFis = PhyloFis
326 { _phyloFis_clique :: Clique
327 , _phyloFis_support :: Support
328 , _phyloFis_period :: (Date,Date)
329 } deriving (Generic,NFData,Show,Eq)
330
331
332 ----------------
333 -- | Export | --
334 ----------------
335
336 type DotId = TextLazy.Text
337
338 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
339
340 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
341
342 data Order = Asc | Desc deriving (Show,Generic,Eq)
343
344 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
345
346 data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
347
348 data PhyloLabel =
349 BranchLabel
350 { _branch_labelTagger :: Tagger
351 , _branch_labelSize :: Int }
352 | GroupLabel
353 { _group_labelTagger :: Tagger
354 , _group_labelSize :: Int }
355 deriving (Show,Generic,Eq)
356
357 data PhyloBranch =
358 PhyloBranch
359 { _branch_id :: PhyloBranchId
360 , _branch_label :: Text
361 , _branch_meta :: Map Text [Double]
362 } deriving (Generic, Show)
363
364 data PhyloExport =
365 PhyloExport
366 { _export_groups :: [PhyloGroup]
367 , _export_branches :: [PhyloBranch]
368 } deriving (Generic, Show)
369
370 ----------------
371 -- | Lenses | --
372 ----------------
373
374 makeLenses ''Config
375 makeLenses ''Proximity
376 makeLenses ''Quality
377 makeLenses ''ContextualUnit
378 makeLenses ''PhyloLabel
379 makeLenses ''TimeUnit
380 makeLenses ''PhyloFoundations
381 makeLenses ''PhyloFis
382 makeLenses ''Phylo
383 makeLenses ''PhyloPeriod
384 makeLenses ''PhyloLevel
385 makeLenses ''PhyloGroup
386 makeLenses ''PhyloParam
387 makeLenses ''PhyloExport
388 makeLenses ''PhyloBranch
389
390 ------------------------
391 -- | JSON instances | --
392 ------------------------
393
394
395 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)