]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
introduce EKG-based monitoring infrastructure
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
24 where
25
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join, sequence)
29 import Control.Monad.IO.Class (liftIO)
30 import Data.Attoparsec.ByteString (parseOnly, Parser)
31 import Data.Either(Either(..))
32 import Data.Either.Extra (partitionEithers)
33 import Data.List (concat, lookup)
34 import Data.Ord()
35 import Data.String (String())
36 import Data.String()
37 import Data.Text (Text)
38 import Data.Text.Encoding (decodeUtf8)
39 import Data.Tuple.Extra (both, first, second)
40 import System.FilePath (FilePath(), takeExtension)
41 import qualified Data.ByteString as DB
42 import qualified Data.ByteString.Char8 as DBC
43 import qualified Data.ByteString.Lazy as DBL
44 import qualified Data.Map as DM
45 import qualified Data.Text as DT
46 import qualified Prelude as Prelude
47 import System.IO.Temp (emptySystemTempFile)
48
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
51 import Gargantext.Prelude
52 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
53 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
54 -- import Gargantext.Core.Text.Learn (detectLangDefault)
55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
58 ------------------------------------------------------------------------
59
60 type ParseError = String
61 --type Field = Text
62 --type Document = DM.Map Field Text
63 --type FilesParsed = DM.Map FilePath FileParsed
64 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
65 -- , _fileParsed_result :: [Document]
66 -- } deriving (Show)
67
68
69 -- | According to the format of Input file,
70 -- different parser are available.
71 data FileFormat = WOS | RIS | RisPresse
72 | CsvGargV3 | CsvHal
73 | ZIP
74 deriving (Show)
75
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
81
82
83 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
84 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
85 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
86 parseFormat RisPresse bs = do
87 docs <- mapM (toDoc RIS)
88 <$> snd
89 <$> enrichWith RisPresse
90 $ partitionEithers
91 $ [runParser' RisPresse bs]
92 pure $ Right docs
93 parseFormat WOS bs = do
94 docs <- mapM (toDoc WOS)
95 <$> snd
96 <$> enrichWith WOS
97 $ partitionEithers
98 $ [runParser' WOS bs]
99 pure $ Right docs
100 parseFormat ZIP bs = do
101 path <- emptySystemTempFile "parsed.zip"
102 DB.writeFile path bs
103 withArchive path $ do
104 files <- DM.keys <$> getEntries
105 filesContents <- mapM getEntry files
106 ddocs <- liftIO $ mapM (parseFormat CsvGargV3) filesContents
107 pure $ concat <$> sequence ddocs
108 parseFormat _ _ = undefined
109
110 -- | Parse file into documents
111 -- TODO manage errors here
112 -- TODO: to debug maybe add the filepath in error message
113 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
114 parseFile CsvHal p = parseHal p
115 parseFile CsvGargV3 p = parseCsv p
116 parseFile RisPresse p = do
117 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
118 pure $ Right docs
119 parseFile WOS p = do
120 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
121 pure $ Right docs
122 parseFile ff p = do
123 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
124 pure $ Right docs
125
126 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
127 -- TODO use language for RIS
128 toDoc ff d = do
129 -- let abstract = lookup "abstract" d
130 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
131
132 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
133
134 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
135
136 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
137 , _hd_doi = lookup "doi" d
138 , _hd_url = lookup "URL" d
139 , _hd_uniqId = Nothing
140 , _hd_uniqIdBdd = Nothing
141 , _hd_page = Nothing
142 , _hd_title = lookup "title" d
143 , _hd_authors = Nothing
144 , _hd_institutes = lookup "authors" d
145 , _hd_source = lookup "source" d
146 , _hd_abstract = lookup "abstract" d
147 , _hd_publication_date = fmap (DT.pack . show) utcTime
148 , _hd_publication_year = pub_year
149 , _hd_publication_month = pub_month
150 , _hd_publication_day = pub_day
151 , _hd_publication_hour = Nothing
152 , _hd_publication_minute = Nothing
153 , _hd_publication_second = Nothing
154 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
155
156 enrichWith :: FileFormat
157 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
158 enrichWith RisPresse = enrichWith' presseEnrich
159 enrichWith WOS = enrichWith' (map (first WOS.keys))
160 enrichWith _ = enrichWith' identity
161
162
163 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
164 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
165 enrichWith' f = second (map both' . map f . concat)
166 where
167 both' = map (both decodeUtf8)
168
169
170
171 readFileWith :: FileFormat -> FilePath
172 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
173 readFileWith format path = do
174 files <- case takeExtension path of
175 ".zip" -> openZip path
176 _ -> pure <$> clean <$> DB.readFile path
177 partitionEithers <$> mapConcurrently (runParser format) files
178
179
180 -- | withParser:
181 -- According to the format of the text, choose the right parser.
182 -- TODO withParser :: FileFormat -> Parser [Document]
183 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
184 withParser WOS = WOS.parser
185 withParser RIS = RIS.parser
186 --withParser ODT = odtParser
187 --withParser XML = xmlParser
188 withParser _ = panic "[ERROR] Parser not implemented yet"
189
190 runParser :: FileFormat -> DB.ByteString
191 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
192 runParser format text = pure $ runParser' format text
193
194 runParser' :: FileFormat -> DB.ByteString
195 -> (Either String [[(DB.ByteString, DB.ByteString)]])
196 runParser' format text = parseOnly (withParser format) text
197
198 openZip :: FilePath -> IO [DB.ByteString]
199 openZip fp = do
200 entries <- withArchive fp (DM.keys <$> getEntries)
201 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
202 pure bs
203
204 cleanText :: Text -> Text
205 cleanText = cs . clean . cs
206
207 clean :: DB.ByteString -> DB.ByteString
208 clean txt = DBC.map clean' txt
209 where
210 clean' '’' = '\''
211 clean' '\r' = ' '
212 clean' '\t' = ' '
213 clean' ';' = '.'
214 clean' c = c