]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
[FIX] Order 2 regression and split of clustering
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
24 where
25
26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
28 import Conduit
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Control.Monad.Trans.Control (MonadBaseControl)
32 import Data.Attoparsec.ByteString (parseOnly, Parser)
33 import Data.Either(Either(..))
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat, lookup)
36 import Data.Ord()
37 import Data.String (String())
38 import Data.String()
39 import Data.Text (Text, intercalate, pack, unpack)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Tuple.Extra (both, first, second)
42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
45 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
46 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
47 import Gargantext.Prelude
48 import System.FilePath (FilePath(), takeExtension)
49 import System.IO.Temp (emptySystemTempFile)
50 import qualified Data.ByteString as DB
51 import qualified Data.ByteString.Char8 as DBC
52 import qualified Data.ByteString.Lazy as DBL
53 import qualified Data.Map as DM
54 import qualified Data.Text as DT
55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
58 import qualified Prelude
59 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
60 ------------------------------------------------------------------------
61
62 type ParseError = String
63 --type Field = Text
64 --type Document = DM.Map Field Text
65 --type FilesParsed = DM.Map FilePath FileParsed
66 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
67 -- , _fileParsed_result :: [Document]
68 -- } deriving (Show)
69
70
71 -- | According to the format of Input file,
72 -- different parser are available.
73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
74 deriving (Show)
75
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
81
82 parseFormatC :: MonadBaseControl IO m
83 => FileType
84 -> FileFormat
85 -> DB.ByteString
86 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
87 parseFormatC CsvGargV3 Plain bs = do
88 let eParsedC = parseCsvC $ DBL.fromStrict bs
89 case eParsedC of
90 Left err -> pure $ Left err
91 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
92 parseFormatC CsvHal Plain bs = do
93 let eParsedC = parseCsvC $ DBL.fromStrict bs
94 case eParsedC of
95 Left err -> pure $ Left err
96 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
97 parseFormatC RisPresse Plain bs = do
98 --docs <- enrichWith RisPresse
99 let eDocs = runParser' RisPresse bs
100 pure $ (\docs ->
101 ( Just $ fromIntegral $ length docs
102 , yieldMany docs
103 .| mapC presseEnrich
104 .| mapC (map $ both decodeUtf8)
105 .| mapMC (toDoc RIS)) ) <$> eDocs
106 parseFormatC WOS Plain bs = do
107 let eDocs = runParser' WOS bs
108 pure $ (\docs ->
109 ( Just $ fromIntegral $ length docs
110 , yieldMany docs
111 .| mapC (map $ first WOS.keys)
112 .| mapC (map $ both decodeUtf8)
113 .| mapMC (toDoc WOS)) ) <$> eDocs
114 parseFormatC ft ZIP bs = do
115 path <- liftBase $ emptySystemTempFile "parsed-zip"
116 liftBase $ DB.writeFile path bs
117 fileContents <- liftBase $ withArchive path $ do
118 files <- DM.keys <$> getEntries
119 mapM getEntry files
120 --printDebug "[parseFormatC] fileContents" fileContents
121 eContents <- mapM (parseFormatC ft Plain) fileContents
122 --printDebug "[parseFormatC] contents" contents
123 --pure $ Left $ "Not implemented for ZIP"
124 let (errs, contents) = partitionEithers eContents
125 case errs of
126 [] ->
127 case contents of
128 [] -> pure $ Left "No files in zip"
129 _ -> do
130 let lenghts = fst <$> contents
131 let contents' = snd <$> contents
132 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
133 pure $ Right ( Just totalLength
134 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
135 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
136
137 parseFormatC _ _ _ = undefined
138
139 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
140 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
141 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
142 -- parseFormat RisPresse bs = do
143 -- docs <- mapM (toDoc RIS)
144 -- <$> snd
145 -- <$> enrichWith RisPresse
146 -- $ partitionEithers
147 -- $ [runParser' RisPresse bs]
148 -- pure $ Right docs
149 -- parseFormat WOS bs = do
150 -- docs <- mapM (toDoc WOS)
151 -- <$> snd
152 -- <$> enrichWith WOS
153 -- $ partitionEithers
154 -- $ [runParser' WOS bs]
155 -- pure $ Right docs
156 -- parseFormat ZIP bs = do
157 -- path <- emptySystemTempFile "parsed-zip"
158 -- DB.writeFile path bs
159 -- parsedZip <- withArchive path $ do
160 -- DM.keys <$> getEntries
161 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
162 -- parseFormat _ _ = undefined
163
164 -- | Parse file into documents
165 -- TODO manage errors here
166 -- TODO: to debug maybe add the filepath in error message
167
168 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
169 parseFile CsvHal Plain p = parseHal p
170 parseFile CsvGargV3 Plain p = parseCsv p
171
172 parseFile RisPresse Plain p = do
173 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
174 pure $ Right docs
175
176 parseFile WOS Plain p = do
177 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
178 pure $ Right docs
179
180 parseFile ff _ p = do
181 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
182 pure $ Right docs
183
184 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
185 -- TODO use language for RIS
186 toDoc ff d = do
187 -- let abstract = lookup "abstract" d
188 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
189
190 let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
191 printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
192 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
193
194 let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
195 , _hd_doi = lookup "doi" d
196 , _hd_url = lookup "URL" d
197 , _hd_uniqId = Nothing
198 , _hd_uniqIdBdd = Nothing
199 , _hd_page = Nothing
200 , _hd_title = lookup "title" d
201 , _hd_authors = lookup "authors" d
202 , _hd_institutes = lookup "institutes" d
203 , _hd_source = lookup "source" d
204 , _hd_abstract = lookup "abstract" d
205 , _hd_publication_date = fmap (DT.pack . show) utcTime
206 , _hd_publication_year = pub_year
207 , _hd_publication_month = pub_month
208 , _hd_publication_day = pub_day
209 , _hd_publication_hour = Nothing
210 , _hd_publication_minute = Nothing
211 , _hd_publication_second = Nothing
212 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
213 printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
214 pure hd
215
216 enrichWith :: FileType
217 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
218 enrichWith RisPresse = enrichWith' presseEnrich
219 enrichWith WOS = enrichWith' (map (first WOS.keys))
220 enrichWith _ = enrichWith' identity
221
222
223 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
224 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
225 enrichWith' f = second (map both' . map f . concat)
226 where
227 both' = map (both decodeUtf8)
228
229
230
231 readFileWith :: FileType -> FilePath
232 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
233 readFileWith format path = do
234 files <- case takeExtension path of
235 ".zip" -> openZip path
236 _ -> pure <$> clean <$> DB.readFile path
237 partitionEithers <$> mapConcurrently (runParser format) files
238
239
240 -- | withParser:
241 -- According to the format of the text, choose the right parser.
242 -- TODO withParser :: FileType -> Parser [Document]
243 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
244 withParser WOS = WOS.parser
245 withParser RIS = RIS.parser
246 --withParser ODT = odtParser
247 --withParser XML = xmlParser
248 withParser _ = panic "[ERROR] Parser not implemented yet"
249
250 runParser :: FileType -> DB.ByteString
251 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
252 runParser format text = pure $ runParser' format text
253
254 runParser' :: FileType -> DB.ByteString
255 -> (Either String [[(DB.ByteString, DB.ByteString)]])
256 runParser' format text = parseOnly (withParser format) text
257
258 openZip :: FilePath -> IO [DB.ByteString]
259 openZip fp = do
260 entries <- withArchive fp (DM.keys <$> getEntries)
261 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
262 pure bs
263
264 cleanText :: Text -> Text
265 cleanText = cs . clean . cs
266
267 clean :: DB.ByteString -> DB.ByteString
268 clean txt = DBC.map clean' txt
269 where
270 clean' '’' = '\''
271 clean' '\r' = ' '
272 clean' '\t' = ' '
273 clean' ';' = '.'
274 clean' c = c
275
276 --
277
278 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
279 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
280 splitOn _ _ = (DT.splitOn ", ")
281