]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/Aggregates/Document.hs
[DOC] Parser Pubmed.
[gargantext.git] / src / Gargantext / Viz / Phylo / Aggregates / Document.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Viz.Phylo.Aggregates.Document
18 where
19
20 import Control.Lens hiding (both, Level)
21
22 import Data.List (last,nub,(++))
23 import Data.Map (Map,member)
24 import Data.Text (Text)
25 import Data.Tuple (fst, snd)
26 import Data.Vector (Vector)
27 import Gargantext.Prelude
28 import Gargantext.Text.Terms.Mono (monoTexts)
29 import Gargantext.Viz.Phylo
30 import Gargantext.Viz.Phylo.Tools
31 import qualified Data.List as List
32 import qualified Data.Map as Map
33 import qualified Data.Vector as Vector
34
35
36 -- | To init a list of Periods framed by a starting Date and an ending Date
37 initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)]
38 initPeriods g s (start,end) = map (\l -> (head' "Doc" l, last l))
39 $ chunkAlong g s [start .. end]
40
41
42 -- | To group a list of Documents by fixed periods
43 groupDocsByPeriod :: (Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
44 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
45 groupDocsByPeriod f pds es = Map.fromList $ zip pds $ map (inPeriode f es) pds
46 where
47 --------------------------------------
48 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
49 inPeriode f' h (start,end) =
50 fst $ List.partition (\d -> f' d >= start && f' d <= end) h
51 --------------------------------------
52
53 reduceByPeaks :: Map Ngrams Ngrams -> [Ngrams] -> [Ngrams]
54 reduceByPeaks m ns = (\(f,s) -> f ++ (nub s))
55 $ foldl (\mem n -> if member n m
56 then (fst mem,(snd mem) ++ [m Map.! n])
57 else ((fst mem) ++ [n],snd mem)
58 ) ([],[]) ns
59
60 -- | To parse a list of Documents by filtering on a Vector of Ngrams
61 parseDocs :: Vector Ngrams -> PhyloPeaks -> [(Date,Text)] -> [Document]
62 parseDocs fds peaks c = map (\(d,t)
63 -> Document d ( reduceByPeaks mPeaks
64 $ filter (\x -> Vector.elem x fds)
65 $ monoTexts t)) c
66 where
67 --------------------------------------
68 mPeaks :: Map Ngrams Ngrams
69 mPeaks = forestToMap (peaks ^. phylo_peaksForest)
70 --------------------------------------
71
72
73 -- | To transform a Corpus of texts into a Map of aggregated Documents grouped by Periods
74 corpusToDocs :: [(Date,Text)] -> Phylo -> Map (Date,Date) [Document]
75 corpusToDocs c p = groupDocsByPeriod date (getPhyloPeriods p)
76 $ parseDocs (getFoundations p) (getPeaks p) c