]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
[FEAT] Iramuteq parser (WIP)
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
24 where
25
26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
28 import Conduit
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Control.Monad.Trans.Control (MonadBaseControl)
32 import Data.Attoparsec.ByteString (parseOnly, Parser)
33 import Data.Either(Either(..))
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat, lookup)
36 import Data.Ord()
37 import Data.String (String())
38 import Data.String()
39 import Data.Text (Text, intercalate, pack, unpack)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Tuple.Extra (both, first, second)
42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
45 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
46 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
47 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
49 import System.FilePath (FilePath(), takeExtension)
50 import System.IO.Temp (emptySystemTempFile)
51 import qualified Data.ByteString as DB
52 import qualified Data.ByteString.Char8 as DBC
53 import qualified Data.ByteString.Lazy as DBL
54 import qualified Data.Map as DM
55 import qualified Data.Text as DT
56 import qualified Data.Text as Text
57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
58 import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
59 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
60 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
61 import qualified Prelude
62 ------------------------------------------------------------------------
63
64 type ParseError = String
65 --type Field = Text
66 --type Document = DM.Map Field Text
67 --type FilesParsed = DM.Map FilePath FileParsed
68 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
69 -- , _fileParsed_result :: [Document]
70 -- } deriving (Show)
71
72
73 -- | According to the format of Input file,
74 -- different parser are available.
75 data FileType = WOS
76 | RIS
77 | RisPresse
78 | CsvGargV3
79 | CsvHal
80 | Iramuteq
81 deriving (Show)
82
83 -- Implemented (ISI Format)
84 -- | DOC -- Not Implemented / import Pandoc
85 -- | ODT -- Not Implemented / import Pandoc
86 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
87 -- | XML -- Not Implemented / see :
88
89 parseFormatC :: MonadBaseControl IO m
90 => FileType
91 -> FileFormat
92 -> DB.ByteString
93 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
94 parseFormatC CsvGargV3 Plain bs = do
95 let eParsedC = parseCsvC $ DBL.fromStrict bs
96 case eParsedC of
97 Left err -> pure $ Left err
98 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
99 parseFormatC CsvHal Plain bs = do
100 let eParsedC = parseCsvC $ DBL.fromStrict bs
101 case eParsedC of
102 Left err -> pure $ Left err
103 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
104 parseFormatC RisPresse Plain bs = do
105 --docs <- enrichWith RisPresse
106 let eDocs = runParser' RisPresse bs
107 pure $ (\docs ->
108 ( Just $ fromIntegral $ length docs
109 , yieldMany docs
110 .| mapC presseEnrich
111 .| mapC (map $ both decodeUtf8)
112 .| mapMC (toDoc RIS)) ) <$> eDocs
113 parseFormatC WOS Plain bs = do
114 let eDocs = runParser' WOS bs
115 pure $ (\docs ->
116 ( Just $ fromIntegral $ length docs
117 , yieldMany docs
118 .| mapC (map $ first WOS.keys)
119 .| mapC (map $ both decodeUtf8)
120 .| mapMC (toDoc WOS)) ) <$> eDocs
121
122 parseFormatC Iramuteq Plain bs = do
123 let eDocs = runParser' Iramuteq bs
124 pure $ (\docs ->
125 ( Just $ fromIntegral $ length docs
126 , yieldMany docs
127 .| mapC (map $ first Iramuteq.keys)
128 .| mapC (map $ both decodeUtf8)
129 .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " ")))) ))<$> eDocs
130
131 parseFormatC ft ZIP bs = do
132 path <- liftBase $ emptySystemTempFile "parsed-zip"
133 liftBase $ DB.writeFile path bs
134 fileContents <- liftBase $ withArchive path $ do
135 files <- DM.keys <$> getEntries
136 mapM getEntry files
137 --printDebug "[parseFormatC] fileContents" fileContents
138 eContents <- mapM (parseFormatC ft Plain) fileContents
139 --printDebug "[parseFormatC] contents" contents
140 --pure $ Left $ "Not implemented for ZIP"
141 let (errs, contents) = partitionEithers eContents
142 case errs of
143 [] ->
144 case contents of
145 [] -> pure $ Left "No files in zip"
146 _ -> do
147 let lenghts = fst <$> contents
148 let contents' = snd <$> contents
149 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
150 pure $ Right ( Just totalLength
151 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
152 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
153
154 parseFormatC _ _ _ = undefined
155
156 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
157 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
158 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
159 -- parseFormat RisPresse bs = do
160 -- docs <- mapM (toDoc RIS)
161 -- <$> snd
162 -- <$> enrichWith RisPresse
163 -- $ partitionEithers
164 -- $ [runParser' RisPresse bs]
165 -- pure $ Right docs
166 -- parseFormat WOS bs = do
167 -- docs <- mapM (toDoc WOS)
168 -- <$> snd
169 -- <$> enrichWith WOS
170 -- $ partitionEithers
171 -- $ [runParser' WOS bs]
172 -- pure $ Right docs
173 -- parseFormat ZIP bs = do
174 -- path <- emptySystemTempFile "parsed-zip"
175 -- DB.writeFile path bs
176 -- parsedZip <- withArchive path $ do
177 -- DM.keys <$> getEntries
178 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
179 -- parseFormat _ _ = undefined
180
181 -- | Parse file into documents
182 -- TODO manage errors here
183 -- TODO: to debug maybe add the filepath in error message
184
185 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
186 parseFile CsvHal Plain p = parseHal p
187 parseFile CsvGargV3 Plain p = parseCsv p
188
189 parseFile RisPresse Plain p = do
190 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
191 pure $ Right docs
192
193 parseFile WOS Plain p = do
194 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
195 pure $ Right docs
196
197 parseFile Iramuteq Plain p = do
198 docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
199 <$> snd
200 <$> enrichWith Iramuteq
201 <$> readFileWith Iramuteq p
202 pure $ Right docs
203
204
205 parseFile ff _ p = do
206 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
207 pure $ Right docs
208
209 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
210 -- TODO use language for RIS
211 toDoc ff d = do
212 -- let abstract = lookup "abstract" d
213 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
214
215 let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
216 -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
217 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
218
219 let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
220 , _hd_doi = lookup "doi" d
221 , _hd_url = lookup "URL" d
222 , _hd_uniqId = Nothing
223 , _hd_uniqIdBdd = Nothing
224 , _hd_page = Nothing
225 , _hd_title = lookup "title" d
226 , _hd_authors = lookup "authors" d
227 , _hd_institutes = lookup "institutes" d
228 , _hd_source = lookup "source" d
229 , _hd_abstract = lookup "abstract" d
230 , _hd_publication_date = fmap (DT.pack . show) utcTime
231 , _hd_publication_year = pub_year
232 , _hd_publication_month = pub_month
233 , _hd_publication_day = pub_day
234 , _hd_publication_hour = Nothing
235 , _hd_publication_minute = Nothing
236 , _hd_publication_second = Nothing
237 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
238 -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
239 pure hd
240
241 enrichWith :: FileType
242 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
243 enrichWith RisPresse = enrichWith' presseEnrich
244 enrichWith WOS = enrichWith' (map (first WOS.keys))
245 enrichWith Iramuteq = enrichWith' (map (first Iramuteq.keys))
246 enrichWith _ = enrichWith' identity
247
248
249 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
250 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
251 enrichWith' f = second (map both' . map f . concat)
252 where
253 both' = map (both decodeUtf8)
254
255
256
257 readFileWith :: FileType -> FilePath
258 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
259 readFileWith format path = do
260 files <- case takeExtension path of
261 ".zip" -> openZip path
262 _ -> pure <$> clean <$> DB.readFile path
263 partitionEithers <$> mapConcurrently (runParser format) files
264
265
266 -- | withParser:
267 -- According to the format of the text, choose the right parser.
268 -- TODO withParser :: FileType -> Parser [Document]
269 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
270 withParser WOS = WOS.parser
271 withParser RIS = RIS.parser
272 withParser Iramuteq = Iramuteq.parser
273 --withParser ODT = odtParser
274 --withParser XML = xmlParser
275 withParser _ = panic "[ERROR] Parser not implemented yet"
276
277 runParser :: FileType -> DB.ByteString
278 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
279 runParser format text = pure $ runParser' format text
280
281 runParser' :: FileType -> DB.ByteString
282 -> (Either String [[(DB.ByteString, DB.ByteString)]])
283 runParser' format text = parseOnly (withParser format) text
284
285 openZip :: FilePath -> IO [DB.ByteString]
286 openZip fp = do
287 entries <- withArchive fp (DM.keys <$> getEntries)
288 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
289 pure bs
290
291 cleanText :: Text -> Text
292 cleanText = cs . clean . cs
293
294 clean :: DB.ByteString -> DB.ByteString
295 clean txt = DBC.map clean' txt
296 where
297 clean' '’' = '\''
298 clean' '\r' = ' '
299 clean' '\t' = ' '
300 clean' ';' = '.'
301 clean' c = c
302
303 --
304
305 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
306 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
307 splitOn _ _ = (DT.splitOn ", ")
308