{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext enables the analysis of semi-structured text, which must first
be parsed.

The parsers assume that the format of the text is known (the FileType data
type), so that the right parser can be chosen from the list of available
parsers.

This module mainly describes how to add a new parser to Gargantext;
please follow the types.
-}

{-# LANGUAGE PackageImports #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
  where
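
-- A minimal usage sketch (the file path is a made-up example): pick the
-- 'FileType' matching the input and hand it to 'parseFile'.
--
-- @
-- do eDocs <- parseFile WOS Plain "/tmp/corpus.isi"
--    case eDocs of
--      Left err   -> putStrLn err
--      Right docs -> print (length docs)
-- @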

import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad.Trans.Control (MonadBaseControl)
import Control.Monad (join)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Prelude
import System.IO.Temp (emptySystemTempFile)

import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
------------------------------------------------------------------------

type ParseError = String
--type Field       = Text
--type Document    = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors :: Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)


-- | According to the format of the input file,
-- different parsers are available.
data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
  deriving (Show)

  -- WOS: Implemented (ISI Format)
  -- | DOC  -- Not Implemented / import Pandoc
  -- | ODT  -- Not Implemented / import Pandoc
  -- | PDF  -- Not Implemented / pdftotext and import Pandoc ?
  -- | XML  -- Not Implemented / see :

parseFormatC :: MonadBaseControl IO m => FileType -> FileFormat -> DB.ByteString -> m (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
parseFormatC CsvHal    Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
parseFormatC RisPresse Plain bs = do
  --docs <- enrichWith RisPresse
  let eDocs = runParser' RisPresse bs
  pure $ (\docs -> yieldMany docs
                .| mapC presseEnrich
                .| mapC (map $ both decodeUtf8)
                .| mapMC (toDoc RIS)) <$> eDocs
parseFormatC WOS Plain bs = do
  let eDocs = runParser' WOS bs
  pure $ (\docs -> yieldMany docs
                .| mapC (map $ first WOS.keys)
                .| mapC (map $ both decodeUtf8)
                .| mapMC (toDoc WOS)) <$> eDocs
parseFormatC ft ZIP bs = do
  path <- liftBase $ emptySystemTempFile "parsed-zip"
  liftBase $ DB.writeFile path bs
  fileContents <- liftBase $ withArchive path $ do
    files <- DM.keys <$> getEntries
    mapM getEntry files
  --printDebug "[parseFormatC] fileContents" fileContents
  eContents <- mapM (parseFormatC ft Plain) fileContents
  --printDebug "[parseFormatC] contents" contents
  --pure $ Left $ "Not implemented for ZIP"
  let (errs, contents) = partitionEithers eContents
  case errs of
    [] ->
      case contents of
        [] -> pure $ Left "No files in zip"
        _  -> pure $ Right $ ( sequenceConduits contents >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
    _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs

parseFormatC _ _ _ = undefined
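
-- A minimal usage sketch for the streaming variant (the file path is a
-- made-up example): run the returned conduit and collect the documents.
--
-- @
-- do bs <- DB.readFile "/tmp/corpus.ris"
--    eC <- parseFormatC RisPresse Plain bs
--    case eC of
--      Left err -> putStrLn err
--      Right c  -> runConduit (c .| sinkList) >>= print . length
-- @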

-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-- parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
--   docs <- mapM (toDoc RIS)
--           <$> snd
--           <$> enrichWith RisPresse
--           $ partitionEithers
--           $ [runParser' RisPresse bs]
--   pure $ Right docs
-- parseFormat WOS bs = do
--   docs <- mapM (toDoc WOS)
--           <$> snd
--           <$> enrichWith WOS
--           $ partitionEithers
--           $ [runParser' WOS bs]
--   pure $ Right docs
-- parseFormat ZIP bs = do
--   path <- emptySystemTempFile "parsed-zip"
--   DB.writeFile path bs
--   parsedZip <- withArchive path $ do
--     DM.keys <$> getEntries
--   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined

-- | Parse a file into documents
-- TODO manage errors here
-- TODO: for debugging, maybe add the file path to the error message
parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p
parseFile RisPresse Plain p = do
  docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs
parseFile WOS Plain p = do
  docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs
parseFile ff _ p = do
  docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs

toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

  let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d

  (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse

  pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                           , _hd_doi = lookup "doi" d
                           , _hd_url = lookup "URL" d
                           , _hd_uniqId = Nothing
                           , _hd_uniqIdBdd = Nothing
                           , _hd_page = Nothing
                           , _hd_title = lookup "title" d
                           , _hd_authors = Nothing
                           , _hd_institutes = lookup "authors" d
                           , _hd_source = lookup "source" d
                           , _hd_abstract = lookup "abstract" d
                           , _hd_publication_date = fmap (DT.pack . show) utcTime
                           , _hd_publication_year = pub_year
                           , _hd_publication_month = pub_month
                           , _hd_publication_day = pub_day
                           , _hd_publication_hour = Nothing
                           , _hd_publication_minute = Nothing
                           , _hd_publication_second = Nothing
                           , _hd_language_iso2 = Just $ (DT.pack . show) lang }

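-- A sketch of the association list 'toDoc' consumes (keys as looked up
-- above; the values are made up):
--
-- @
-- doc <- toDoc RIS [ ("title", "A title")
--                  , ("abstract", "An abstract")
--                  , ("PY", "2017") ]
-- @
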
enrichWith :: FileType
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith _         = enrichWith' identity


enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)

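-- To illustrate the shape change (illustrative values; "TI" is just an
-- example key): the per-file document lists are merged by 'concat' and the
-- bytestring pairs are decoded to 'Text'.
--
-- @
-- enrichWith' identity ([], [[[("TI", "A title")]]])
--   == ([], [[("TI", "A title")]])
-- @
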


readFileWith :: FileType -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) files


-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileType -> Parser [Document]
withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _   = panic "[ERROR] Parser not implemented yet"

runParser :: FileType -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileType -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entries <- withArchive fp (DM.keys <$> getEntries)
  bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
  pure bs

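-- A quick illustration of the cleaning below (a sketch, not a doctest from
-- the codebase): semicolons become dots, tabs and carriage returns become
-- spaces.
--
-- @
-- cleanText "a;b\tc" == "a.b c"
-- @
--
-- Note that 'clean' maps over individual bytes ('DBC.map'), so the
-- multi-byte UTF-8 '’' is not actually rewritten; it passes through
-- unchanged.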
cleanText :: Text -> Text
cleanText = cs . clean . cs

clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
  where
    clean' '’'  = '\''
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c