{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext analyzes semi-structured texts, which have to be parsed
before they can be analyzed.

The parsers assume the format of the text is known (the 'FileFormat' data
type); the right parser is then chosen accordingly from the list of
available parsers.

This module mainly describes how to add a new parser to Gargantext:
please follow the types.
-}

{-# LANGUAGE PackageImports #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
    where

import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either (Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
import qualified Data.ByteString       as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy  as DBL
import qualified Data.Map              as DM
import qualified Data.Text             as DT
import qualified Prelude               as Prelude

import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
------------------------------------------------------------------------

type ParseError = String
--type Field       = Text
--type Document    = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors :: Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)


-- | According to the format of the input file,
-- different parsers are available.
data FileFormat = WOS | RIS | RisPresse   -- Implemented (WOS is the ISI Format)
                | CsvGargV3 | CsvHal
                | ZIP
  deriving (Show)

-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :

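-- Adding a new format (a sketch, following the types; the XML constructor and
-- xmlParser are hypothetical, see the commented-out cases in 'withParser'):
--
-- > data FileFormat = ... | XML
-- > withParser XML = xmlParser   -- a Parser [[(DB.ByteString, DB.ByteString)]]
--
-- plus a corresponding clause in 'parseFormat' and/or 'parseFile', which can
-- reuse 'readFileWith', 'enrichWith' and 'toDoc'.
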
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = do
  let docs = map (toDoc RIS)
             <$> snd
             <$> enrichWith RisPresse
             $   partitionEithers
             $   [runParser' RisPresse bs]
  pure $ Right docs
parseFormat WOS bs = do
  let docs = map (toDoc WOS)
             <$> snd
             <$> enrichWith WOS
             $   partitionEithers
             $   [runParser' WOS bs]
  pure $ Right docs
parseFormat ZIP bs = do
  path <- emptySystemTempFile "parsed-zip"
  DB.writeFile path bs
  parsedZip <- withArchive path $ do
    DM.keys <$> getEntries
  pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
parseFormat _ _ = undefined
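
-- Example use of 'parseFormat' (a sketch; the file name is hypothetical):
--
-- > do bs    <- DB.readFile "corpus.ris"
-- >    edocs <- parseFormat RisPresse bs   -- Either Prelude.String [HyperdataDocument]
-- >    pure edocs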

-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = do
  docs <- map (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs
parseFile WOS p = do
  docs <- map (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs
parseFile ff p = do
  docs <- map (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs

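-- Example use of 'parseFile' (a sketch; the paths are hypothetical). A plain
-- WOS export and a ZIP of such exports are both accepted, because
-- 'readFileWith' dispatches on the ".zip" extension:
--
-- > parseFile WOS "export_wos.txt"
-- > parseFile WOS "exports_wos.zip"
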
toDoc :: FileFormat -> [(Text, Text)] -> HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

  let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d

  let (utcTime, (pub_year, pub_month, pub_day)) = Date.dateSplit lang dateToParse

  HyperdataDocument { _hd_bdd                = Just $ DT.pack $ show ff
                    , _hd_doi                = lookup "doi" d
                    , _hd_url                = lookup "URL" d
                    , _hd_uniqId             = Nothing
                    , _hd_uniqIdBdd          = Nothing
                    , _hd_page               = Nothing
                    , _hd_title              = lookup "title" d
                    , _hd_authors            = Nothing
                    , _hd_institutes         = lookup "authors" d
                    , _hd_source             = lookup "source" d
                    , _hd_abstract           = lookup "abstract" d
                    , _hd_publication_date   = fmap (DT.pack . show) utcTime
                    , _hd_publication_year   = pub_year
                    , _hd_publication_month  = pub_month
                    , _hd_publication_day    = pub_day
                    , _hd_publication_hour   = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2      = Just $ (DT.pack . show) lang }

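-- How 'toDoc' assembles the date (a sketch): with e.g. PY = "2019" and
-- publication_date = "2019-05-10", the two Maybe fields are concatenated
-- through the Maybe Semigroup and dashes are replaced by spaces, so
--
-- > dateToParse == Just "2019 2019 05 10"
--
-- which 'Date.dateSplit' then splits into the publication date and its
-- year/month/day parts.
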
enrichWith :: FileFormat
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith _         = enrichWith' identity

enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)

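-- Shape of 'enrichWith'' (a sketch, with a hypothetical field name): errors
-- are left untouched, the per-file nesting is flattened by 'concat' and every
-- key/value pair is decoded from UTF-8, e.g.
--
-- > enrichWith' identity (errs, [[[("TI", "Some title")]]])
-- >   == (errs, [[("TI", "Some title")]])
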
readFileWith :: FileFormat -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) files


-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _   = panic "[ERROR] Parser not implemented yet"

runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileFormat -> DB.ByteString
           -> Either String [[(DB.ByteString, DB.ByteString)]]
runParser' format text = parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entries <- withArchive fp (DM.keys <$> getEntries)
  bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
  pure bs

cleanText :: Text -> Text
cleanText = cs . clean . cs

clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
  where
    clean' '’'  = '\''
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c
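
-- Example (a sketch): carriage returns and tabs become spaces and semicolons
-- become full stops, e.g.
--
-- > cleanText "one\ttwo;three\r" == "one two.three "
--
-- Note that 'DBC.map' maps over single bytes, so the '’' equation only
-- applies to single-byte (Latin-1 style) input, not to UTF-8 encoded text.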