]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/AdaptativePhylo.hs
fix accuracy
[gargantext.git] / src / Gargantext / Viz / AdaptativePhylo.hs
1 {-|
2 Module : Gargantext.Viz.AdaptativePhylo
3 Description : Phylomemy definitions and types.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Specifications of Phylomemy export format.
11
12 Phylomemy can be described as a Temporal Graph with different scale of
13 granularity of group of ngrams (terms and multi-terms).
14
15 The main type is Phylo which is synonym of Phylomemy (only difference is
16 the number of chars).
17
18 References:
19 Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
20 in science evolution — the rise and fall of scientific fields. PloS
21 one 8, e54847.
22 -}
23
24 {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE TemplateHaskell #-}
27 {-# LANGUAGE MultiParamTypeClasses #-}
28
29 module Gargantext.Viz.AdaptativePhylo where
30
31 import Data.Aeson
32 import Data.Aeson.TH (deriveJSON)
33 import Data.Text (Text, pack)
34 import Data.Vector (Vector)
35 import Data.Map (Map)
36 import Data.Set (Set)
37
38 import Gargantext.Core.Utils.Prefix (unPrefix)
39 import Gargantext.Prelude
40 import Gargantext.Text.Context (TermList)
41
42 import GHC.Generics
43 import GHC.IO (FilePath)
44 import Control.DeepSeq (NFData)
45 import Control.Lens (makeLenses)
46
47 import qualified Data.Text.Lazy as TextLazy
48
49
50 ----------------
51 -- | Config | --
52 ----------------
53
54
55 data CorpusParser =
56 Wos {_wos_limit :: Int}
57 | Csv {_csv_limit :: Int}
58 deriving (Show,Generic,Eq)
59
60
61 data Proximity =
62 WeightedLogJaccard
63 { _wlj_sensibility :: Double
64 , _wlj_thresholdInit :: Double
65 , _wlj_thresholdStep :: Double }
66 | Hamming
67 deriving (Show,Generic,Eq)
68
69
70 data Synchrony =
71 ByProximityThreshold
72 { _bpt_threshold :: Double
73 , _bpt_sensibility :: Double}
74 | ByProximityDistribution
75 { _bpd_sensibility :: Double}
76 deriving (Show,Generic,Eq)
77
78
79 data TimeUnit =
80 Year
81 { _year_period :: Int
82 , _year_step :: Int
83 , _year_matchingFrame :: Int }
84 deriving (Show,Generic,Eq)
85
86
87 data ContextualUnit =
88 Fis
89 { _fis_support :: Int
90 , _fis_size :: Int }
91 deriving (Show,Generic,Eq)
92
93
94 data Quality =
95 Quality { _qua_relevance :: Double
96 , _qua_minBranch :: Int }
97 deriving (Show,Generic,Eq)
98
99
100 data Config =
101 Config { corpusPath :: FilePath
102 , listPath :: FilePath
103 , outputPath :: FilePath
104 , corpusParser :: CorpusParser
105 , phyloName :: Text
106 , phyloLevel :: Int
107 , phyloProximity :: Proximity
108 , phyloSynchrony :: Synchrony
109 , phyloQuality :: Quality
110 , timeUnit :: TimeUnit
111 , contextualUnit :: ContextualUnit
112 , exportLabel :: [PhyloLabel]
113 , exportSort :: Sort
114 , exportFilter :: [Filter]
115 } deriving (Show,Generic,Eq)
116
117
118 defaultConfig :: Config
119 defaultConfig =
120 Config { corpusPath = ""
121 , listPath = ""
122 , outputPath = ""
123 , corpusParser = Csv 1000
124 , phyloName = pack "Default Phylo"
125 , phyloLevel = 2
126 , phyloProximity = WeightedLogJaccard 10 0 0.1
127 , phyloSynchrony = ByProximityDistribution 0
128 , phyloQuality = Quality 0.5 3
129 , timeUnit = Year 3 1 5
130 , contextualUnit = Fis 1 5
131 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
132 , exportSort = ByHierarchy
133 , exportFilter = [ByBranchSize 2]
134 }
135
136 instance FromJSON Config
137 instance ToJSON Config
138 instance FromJSON CorpusParser
139 instance ToJSON CorpusParser
140 instance FromJSON Proximity
141 instance ToJSON Proximity
142 instance FromJSON TimeUnit
143 instance ToJSON TimeUnit
144 instance FromJSON ContextualUnit
145 instance ToJSON ContextualUnit
146 instance FromJSON PhyloLabel
147 instance ToJSON PhyloLabel
148 instance FromJSON Tagger
149 instance ToJSON Tagger
150 instance FromJSON Sort
151 instance ToJSON Sort
152 instance FromJSON Order
153 instance ToJSON Order
154 instance FromJSON Filter
155 instance ToJSON Filter
156 instance FromJSON Synchrony
157 instance ToJSON Synchrony
158 instance FromJSON Quality
159 instance ToJSON Quality
160
161
162 -- | Software parameters
163 data Software =
164 Software { _software_name :: Text
165 , _software_version :: Text
166 } deriving (Generic, Show, Eq)
167
168 defaultSoftware :: Software
169 defaultSoftware =
170 Software { _software_name = pack "Gargantext"
171 , _software_version = pack "v4" }
172
173
174 -- | Global parameters of a Phylo
175 data PhyloParam =
176 PhyloParam { _phyloParam_version :: Text
177 , _phyloParam_software :: Software
178 , _phyloParam_config :: Config
179 } deriving (Generic, Show, Eq)
180
181 defaultPhyloParam :: PhyloParam
182 defaultPhyloParam =
183 PhyloParam { _phyloParam_version = pack "v2.adaptative"
184 , _phyloParam_software = defaultSoftware
185 , _phyloParam_config = defaultConfig }
186
187
188 ------------------
189 -- | Document | --
190 ------------------
191
192
193 -- | Date : a simple Integer
194 type Date = Int
195
196 -- | Ngrams : a contiguous sequence of n terms
197 type Ngrams = Text
198
199 -- | Document : a piece of Text linked to a Date
200 data Document = Document
201 { date :: Date
202 , text :: [Ngrams]
203 } deriving (Eq,Show,Generic,NFData)
204
205
206 --------------------
207 -- | Foundation | --
208 --------------------
209
210
211 -- | The Foundations of a Phylo created from a given TermList
212 data PhyloFoundations = PhyloFoundations
213 { _foundations_roots :: !(Vector Ngrams)
214 , _foundations_mapList :: TermList
215 } deriving (Generic, Show, Eq)
216
217
218 ---------------------------
219 -- | Coocurency Matrix | --
220 ---------------------------
221
222
223 -- | Cooc : a coocurency matrix between two ngrams
224 type Cooc = Map (Int,Int) Double
225
226
227 -------------------
228 -- | Phylomemy | --
229 -------------------
230
231
232 -- | Phylo datatype of a phylomemy
233 -- foundations : the foundations of the phylo
234 -- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
235 -- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
236 -- param : the parameters of the phylomemy (with the user's configuration)
237 -- periods : the temporal steps of a phylomemy
238 data Phylo =
239 Phylo { _phylo_foundations :: PhyloFoundations
240 , _phylo_timeCooc :: !(Map Date Cooc)
241 , _phylo_timeDocs :: !(Map Date Double)
242 , _phylo_param :: PhyloParam
243 , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
244 }
245 deriving (Generic, Show, Eq)
246
247
248 -- | PhyloPeriodId : the id of a given period
249 type PhyloPeriodId = (Date,Date)
250
251 -- | PhyloPeriod : steps of a phylomemy on a temporal axis
252 -- id: tuple (start date, end date) of the temporal step of the phylomemy
253 -- levels: levels of granularity
254 data PhyloPeriod =
255 PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
256 , _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
257 } deriving (Generic, Show, Eq)
258
259
260 -- | Level : a level of clustering
261 type Level = Int
262
263 -- | PhyloLevelId : the id of a level of clustering in a given period
264 type PhyloLevelId = (PhyloPeriodId,Level)
265
266 -- | PhyloLevel : levels of phylomemy on a synchronic axis
267 -- Levels description:
268 -- Level 0: The foundations and the base of the phylo
269 -- Level 1: First level of clustering (the Fis)
270 -- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
271 data PhyloLevel =
272 PhyloLevel { _phylo_levelPeriod :: (Date,Date)
273 , _phylo_levelLevel :: Level
274 , _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
275 }
276 deriving (Generic, Show, Eq)
277
278
279 type PhyloGroupId = (PhyloLevelId, Int)
280
281 -- | BranchId : (a level, a sequence of branch index)
282 -- the sequence is a path of heritage from the most to the less specific branch
283 type PhyloBranchId = (Level, [Int])
284
285 -- | PhyloGroup : group of ngrams at each level and period
286 data PhyloGroup =
287 PhyloGroup { _phylo_groupPeriod :: (Date,Date)
288 , _phylo_groupLevel :: Level
289 , _phylo_groupIndex :: Int
290 , _phylo_groupLabel :: Text
291 , _phylo_groupSupport :: Support
292 , _phylo_groupNgrams :: [Int]
293 , _phylo_groupCooc :: !(Cooc)
294 , _phylo_groupBranchId :: PhyloBranchId
295 , _phylo_groupMeta :: Map Text [Double]
296 , _phylo_groupLevelParents :: [Pointer]
297 , _phylo_groupLevelChilds :: [Pointer]
298 , _phylo_groupPeriodParents :: [Pointer]
299 , _phylo_groupPeriodChilds :: [Pointer]
300 }
301 deriving (Generic, Show, Eq, NFData)
302
303 -- | Weight : A generic mesure that can be associated with an Id
304 type Weight = Double
305
306 -- | Pointer : A weighted pointer to a given PhyloGroup
307 type Pointer = (PhyloGroupId, Weight)
308
309 data Filiation = ToParents | ToChilds deriving (Generic, Show)
310 data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
311
312
313 ---------------------------
314 -- | Frequent Item Set | --
315 ---------------------------
316
317 -- | Clique : Set of ngrams cooccurring in the same Document
318 type Clique = Set Ngrams
319
320 -- | Support : Number of Documents where a Clique occurs
321 type Support = Int
322
323 -- | Fis : Frequent Items Set (ie: the association between a Clique and a Support)
324 data PhyloFis = PhyloFis
325 { _phyloFis_clique :: Clique
326 , _phyloFis_support :: Support
327 , _phyloFis_period :: (Date,Date)
328 } deriving (Generic,NFData,Show,Eq)
329
330
331 ----------------
332 -- | Export | --
333 ----------------
334
335 type DotId = TextLazy.Text
336
337 data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
338
339 data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
340
341 data Order = Asc | Desc deriving (Show,Generic,Eq)
342
343 data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
344
345 data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
346
347 data PhyloLabel =
348 BranchLabel
349 { _branch_labelTagger :: Tagger
350 , _branch_labelSize :: Int }
351 | GroupLabel
352 { _group_labelTagger :: Tagger
353 , _group_labelSize :: Int }
354 deriving (Show,Generic,Eq)
355
356 data PhyloBranch =
357 PhyloBranch
358 { _branch_id :: PhyloBranchId
359 , _branch_label :: Text
360 , _branch_meta :: Map Text [Double]
361 } deriving (Generic, Show)
362
363 data PhyloExport =
364 PhyloExport
365 { _export_groups :: [PhyloGroup]
366 , _export_branches :: [PhyloBranch]
367 } deriving (Generic, Show)
368
369 ----------------
370 -- | Lenses | --
371 ----------------
372
373 makeLenses ''Config
374 makeLenses ''Proximity
375 makeLenses ''Quality
376 makeLenses ''ContextualUnit
377 makeLenses ''PhyloLabel
378 makeLenses ''TimeUnit
379 makeLenses ''PhyloFoundations
380 makeLenses ''PhyloFis
381 makeLenses ''Phylo
382 makeLenses ''PhyloPeriod
383 makeLenses ''PhyloLevel
384 makeLenses ''PhyloGroup
385 makeLenses ''PhyloParam
386 makeLenses ''PhyloExport
387 makeLenses ''PhyloBranch
388
389 ------------------------
390 -- | JSON instances | --
391 ------------------------
392
393
394 $(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)