]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
Merge branch '104-dev-john-snow-nlp' of ssh://gitlab.iscpif.fr:20022/cgenie/haskell...
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
24 where
25
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Conduit
28 import Control.Concurrent.Async as CCA (mapConcurrently)
29 import Control.Monad.Trans.Control (MonadBaseControl)
30 import Control.Monad (join)
31 import Data.Attoparsec.ByteString (parseOnly, Parser)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat, lookup)
35 import Data.Ord()
36 import Data.String (String())
37 import Data.String()
38 import Data.Text (Text, intercalate, pack, unpack)
39 import Data.Text.Encoding (decodeUtf8)
40 import Data.Tuple.Extra (both, first, second)
41 import System.FilePath (FilePath(), takeExtension)
42 import qualified Data.ByteString as DB
43 import qualified Data.ByteString.Char8 as DBC
44 import qualified Data.ByteString.Lazy as DBL
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Prelude
48 import System.IO.Temp (emptySystemTempFile)
49
50 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
53 import Gargantext.Prelude
54 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
55 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
56 -- import Gargantext.Core.Text.Learn (detectLangDefault)
57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
58 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
59 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
60 ------------------------------------------------------------------------
61
62 type ParseError = String
63 --type Field = Text
64 --type Document = DM.Map Field Text
65 --type FilesParsed = DM.Map FilePath FileParsed
66 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
67 -- , _fileParsed_result :: [Document]
68 -- } deriving (Show)
69
70
71 -- | According to the format of Input file,
72 -- different parser are available.
73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
74 deriving (Show)
75
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
81
82 parseFormatC :: MonadBaseControl IO m
83 => FileType
84 -> FileFormat
85 -> DB.ByteString
86 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
87 parseFormatC CsvGargV3 Plain bs = do
88 let eParsedC = parseCsvC $ DBL.fromStrict bs
89 case eParsedC of
90 Left err -> pure $ Left err
91 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
92 parseFormatC CsvHal Plain bs = do
93 let eParsedC = parseCsvC $ DBL.fromStrict bs
94 case eParsedC of
95 Left err -> pure $ Left err
96 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
97 parseFormatC RisPresse Plain bs = do
98 --docs <- enrichWith RisPresse
99 let eDocs = runParser' RisPresse bs
100 pure $ (\docs ->
101 ( Just $ fromIntegral $ length docs
102 , yieldMany docs
103 .| mapC presseEnrich
104 .| mapC (map $ both decodeUtf8)
105 .| mapMC (toDoc RIS)) ) <$> eDocs
106 parseFormatC WOS Plain bs = do
107 let eDocs = runParser' WOS bs
108 pure $ (\docs ->
109 ( Just $ fromIntegral $ length docs
110 , yieldMany docs
111 .| mapC (map $ first WOS.keys)
112 .| mapC (map $ both decodeUtf8)
113 .| mapMC (toDoc WOS)) ) <$> eDocs
114 parseFormatC ft ZIP bs = do
115 path <- liftBase $ emptySystemTempFile "parsed-zip"
116 liftBase $ DB.writeFile path bs
117 fileContents <- liftBase $ withArchive path $ do
118 files <- DM.keys <$> getEntries
119 mapM getEntry files
120 --printDebug "[parseFormatC] fileContents" fileContents
121 eContents <- mapM (parseFormatC ft Plain) fileContents
122 --printDebug "[parseFormatC] contents" contents
123 --pure $ Left $ "Not implemented for ZIP"
124 let (errs, contents) = partitionEithers eContents
125 case errs of
126 [] ->
127 case contents of
128 [] -> pure $ Left "No files in zip"
129 _ -> do
130 let lenghts = fst <$> contents
131 let contents' = snd <$> contents
132 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
133 pure $ Right ( Just totalLength
134 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
135 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
136
137 parseFormatC _ _ _ = undefined
138
139 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
140 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
141 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
142 -- parseFormat RisPresse bs = do
143 -- docs <- mapM (toDoc RIS)
144 -- <$> snd
145 -- <$> enrichWith RisPresse
146 -- $ partitionEithers
147 -- $ [runParser' RisPresse bs]
148 -- pure $ Right docs
149 -- parseFormat WOS bs = do
150 -- docs <- mapM (toDoc WOS)
151 -- <$> snd
152 -- <$> enrichWith WOS
153 -- $ partitionEithers
154 -- $ [runParser' WOS bs]
155 -- pure $ Right docs
156 -- parseFormat ZIP bs = do
157 -- path <- emptySystemTempFile "parsed-zip"
158 -- DB.writeFile path bs
159 -- parsedZip <- withArchive path $ do
160 -- DM.keys <$> getEntries
161 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
162 -- parseFormat _ _ = undefined
163
164 -- | Parse file into documents
165 -- TODO manage errors here
166 -- TODO: to debug maybe add the filepath in error message
167 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
168 parseFile CsvHal Plain p = parseHal p
169 parseFile CsvGargV3 Plain p = parseCsv p
170 parseFile RisPresse Plain p = do
171 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
172 pure $ Right docs
173 parseFile WOS Plain p = do
174 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
175 pure $ Right docs
176 parseFile ff _ p = do
177 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
178 pure $ Right docs
179
180 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
181 -- TODO use language for RIS
182 toDoc ff d = do
183 -- let abstract = lookup "abstract" d
184 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
185
186 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
187
188 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
189
190 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
191 , _hd_doi = lookup "doi" d
192 , _hd_url = lookup "URL" d
193 , _hd_uniqId = Nothing
194 , _hd_uniqIdBdd = Nothing
195 , _hd_page = Nothing
196 , _hd_title = lookup "title" d
197 , _hd_authors = Nothing
198 , _hd_institutes = lookup "authors" d
199 , _hd_source = lookup "source" d
200 , _hd_abstract = lookup "abstract" d
201 , _hd_publication_date = fmap (DT.pack . show) utcTime
202 , _hd_publication_year = pub_year
203 , _hd_publication_month = pub_month
204 , _hd_publication_day = pub_day
205 , _hd_publication_hour = Nothing
206 , _hd_publication_minute = Nothing
207 , _hd_publication_second = Nothing
208 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
209
210 enrichWith :: FileType
211 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
212 enrichWith RisPresse = enrichWith' presseEnrich
213 enrichWith WOS = enrichWith' (map (first WOS.keys))
214 enrichWith _ = enrichWith' identity
215
216
217 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
218 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
219 enrichWith' f = second (map both' . map f . concat)
220 where
221 both' = map (both decodeUtf8)
222
223
224
225 readFileWith :: FileType -> FilePath
226 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
227 readFileWith format path = do
228 files <- case takeExtension path of
229 ".zip" -> openZip path
230 _ -> pure <$> clean <$> DB.readFile path
231 partitionEithers <$> mapConcurrently (runParser format) files
232
233
234 -- | withParser:
235 -- According to the format of the text, choose the right parser.
236 -- TODO withParser :: FileType -> Parser [Document]
237 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
238 withParser WOS = WOS.parser
239 withParser RIS = RIS.parser
240 --withParser ODT = odtParser
241 --withParser XML = xmlParser
242 withParser _ = panic "[ERROR] Parser not implemented yet"
243
244 runParser :: FileType -> DB.ByteString
245 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
246 runParser format text = pure $ runParser' format text
247
248 runParser' :: FileType -> DB.ByteString
249 -> (Either String [[(DB.ByteString, DB.ByteString)]])
250 runParser' format text = parseOnly (withParser format) text
251
252 openZip :: FilePath -> IO [DB.ByteString]
253 openZip fp = do
254 entries <- withArchive fp (DM.keys <$> getEntries)
255 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
256 pure bs
257
258 cleanText :: Text -> Text
259 cleanText = cs . clean . cs
260
261 clean :: DB.ByteString -> DB.ByteString
262 clean txt = DBC.map clean' txt
263 where
264 clean' '’' = '\''
265 clean' '\r' = ' '
266 clean' '\t' = ' '
267 clean' ';' = '.'
268 clean' c = c