]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
[conduit] some more work on conduit in file parser (still doesn't compile)
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
24 where
25
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Conduit
28 import Control.Concurrent.Async as CCA (mapConcurrently)
29 import Control.Monad.Identity (runIdentity)
30 import Data.Attoparsec.ByteString (parseOnly, Parser)
31 import Control.Monad (join)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat, lookup)
35 import Data.Ord()
36 import Data.String (String())
37 import Data.String()
38 import Data.Text (Text)
39 import Data.Text.Encoding (decodeUtf8)
40 import Data.Tuple.Extra (both, first, second)
41 import System.FilePath (FilePath(), takeExtension)
42 import qualified Data.ByteString as DB
43 import qualified Data.ByteString.Char8 as DBC
44 import qualified Data.ByteString.Lazy as DBL
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Prelude as Prelude
48 import System.IO.Temp (emptySystemTempFile)
49
50 import Gargantext.Core (Lang(..))
51 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
52 import Gargantext.Prelude
53 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv', parseCsvC)
54 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
55 -- import Gargantext.Core.Text.Learn (detectLangDefault)
56 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
57 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
58 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
59 ------------------------------------------------------------------------
60
61 type ParseError = String
62 --type Field = Text
63 --type Document = DM.Map Field Text
64 --type FilesParsed = DM.Map FilePath FileParsed
65 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
66 -- , _fileParsed_result :: [Document]
67 -- } deriving (Show)
68
69
70 -- | According to the format of Input file,
71 -- different parser are available.
72 data FileFormat = WOS | RIS | RisPresse
73 | CsvGargV3 | CsvHal
74 | ZIP
75 deriving (Show)
76
77 -- Implemented (ISI Format)
78 -- | DOC -- Not Implemented / import Pandoc
79 -- | ODT -- Not Implemented / import Pandoc
80 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
81 -- | XML -- Not Implemented / see :
82
83 parseFormatC :: FileFormat -> DB.ByteString -> IO (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
84 parseFormatC CsvGargV3 bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
85 parseFormatC CsvHal bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
86 parseFormatC RisPresse bs = do
87 docs <- snd
88 <$> enrichWith RisPresse
89 $ partitionEithers
90 $ [runParser' RisPresse bs]
91 pure $ (\docs' -> yieldMany docs' .| mapMC (toDoc RIS)) <$> docs
92 parseFormatC WOS bs = do
93 docs <- snd
94 <$> enrichWith WOS
95 $ partitionEithers
96 $ [runParser' WOS bs]
97 pure $ (\docs' -> yieldMany docs' .| mapMC (toDoc WOS)) <$> docs
98 parseFormatC ZIP bs = do
99 path <- emptySystemTempFile "parsed-zip"
100 DB.writeFile path bs
101 parsedZip <- withArchive path $ do
102 DM.keys <$> getEntries
103 pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
104 parseFormatC _ _ = undefined
105
106 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
107 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
108 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
109 parseFormat RisPresse bs = do
110 docs <- mapM (toDoc RIS)
111 <$> snd
112 <$> enrichWith RisPresse
113 $ partitionEithers
114 $ [runParser' RisPresse bs]
115 pure $ Right docs
116 parseFormat WOS bs = do
117 docs <- mapM (toDoc WOS)
118 <$> snd
119 <$> enrichWith WOS
120 $ partitionEithers
121 $ [runParser' WOS bs]
122 pure $ Right docs
123 parseFormat ZIP bs = do
124 path <- emptySystemTempFile "parsed-zip"
125 DB.writeFile path bs
126 parsedZip <- withArchive path $ do
127 DM.keys <$> getEntries
128 pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
129 parseFormat _ _ = undefined
130
131 -- | Parse file into documents
132 -- TODO manage errors here
133 -- TODO: to debug maybe add the filepath in error message
134 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
135 parseFile CsvHal p = parseHal p
136 parseFile CsvGargV3 p = parseCsv p
137 parseFile RisPresse p = do
138 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
139 pure $ Right docs
140 parseFile WOS p = do
141 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
142 pure $ Right docs
143 parseFile ff p = do
144 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
145 pure $ Right docs
146
147 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
148 -- TODO use language for RIS
149 toDoc ff d = do
150 -- let abstract = lookup "abstract" d
151 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
152
153 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
154
155 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
156
157 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
158 , _hd_doi = lookup "doi" d
159 , _hd_url = lookup "URL" d
160 , _hd_uniqId = Nothing
161 , _hd_uniqIdBdd = Nothing
162 , _hd_page = Nothing
163 , _hd_title = lookup "title" d
164 , _hd_authors = Nothing
165 , _hd_institutes = lookup "authors" d
166 , _hd_source = lookup "source" d
167 , _hd_abstract = lookup "abstract" d
168 , _hd_publication_date = fmap (DT.pack . show) utcTime
169 , _hd_publication_year = pub_year
170 , _hd_publication_month = pub_month
171 , _hd_publication_day = pub_day
172 , _hd_publication_hour = Nothing
173 , _hd_publication_minute = Nothing
174 , _hd_publication_second = Nothing
175 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
176
177 enrichWith :: FileFormat
178 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
179 enrichWith RisPresse = enrichWith' presseEnrich
180 enrichWith WOS = enrichWith' (map (first WOS.keys))
181 enrichWith _ = enrichWith' identity
182
183
184 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
185 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
186 enrichWith' f = second (map both' . map f . concat)
187 where
188 both' = map (both decodeUtf8)
189
190
191
192 readFileWith :: FileFormat -> FilePath
193 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
194 readFileWith format path = do
195 files <- case takeExtension path of
196 ".zip" -> openZip path
197 _ -> pure <$> clean <$> DB.readFile path
198 partitionEithers <$> mapConcurrently (runParser format) files
199
200
201 -- | withParser:
202 -- According to the format of the text, choose the right parser.
203 -- TODO withParser :: FileFormat -> Parser [Document]
204 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
205 withParser WOS = WOS.parser
206 withParser RIS = RIS.parser
207 --withParser ODT = odtParser
208 --withParser XML = xmlParser
209 withParser _ = panic "[ERROR] Parser not implemented yet"
210
211 runParser :: FileFormat -> DB.ByteString
212 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
213 runParser format text = pure $ runParser' format text
214
215 runParser' :: FileFormat -> DB.ByteString
216 -> (Either String [[(DB.ByteString, DB.ByteString)]])
217 runParser' format text = parseOnly (withParser format) text
218
219 openZip :: FilePath -> IO [DB.ByteString]
220 openZip fp = do
221 entries <- withArchive fp (DM.keys <$> getEntries)
222 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
223 pure bs
224
225 cleanText :: Text -> Text
226 cleanText = cs . clean . cs
227
228 clean :: DB.ByteString -> DB.ByteString
229 clean txt = DBC.map clean' txt
230 where
231 clean' '’' = '\''
232 clean' '\r' = ' '
233 clean' '\t' = ' '
234 clean' ';' = '.'
235 clean' c = c