]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
rebase
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
24 where
25
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join, sequence)
29 import Control.Monad.IO.Class (liftIO)
30 import Data.Attoparsec.ByteString (parseOnly, Parser)
31 import Data.Either(Either(..))
32 import Data.Either.Extra (partitionEithers)
33 import Data.List (concat, lookup)
34 import Data.Ord()
35 import Data.String (String())
36 import Data.String()
37 import Data.Text (Text)
38 import Data.Text.Encoding (decodeUtf8)
39 import Data.Tuple.Extra (both, first, second)
40 import System.FilePath (FilePath(), takeExtension)
41 import qualified Data.ByteString as DB
42 import qualified Data.ByteString.Char8 as DBC
43 import qualified Data.ByteString.Lazy as DBL
44 import qualified Data.Map as DM
45 import qualified Data.Text as DT
46 import qualified Prelude as Prelude
47 import System.IO.Temp (emptySystemTempFile)
48
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
51 import Gargantext.Prelude
52 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
53 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
54 -- import Gargantext.Core.Text.Learn (detectLangDefault)
55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
58 ------------------------------------------------------------------------
59
60 type ParseError = String
61 --type Field = Text
62 --type Document = DM.Map Field Text
63 --type FilesParsed = DM.Map FilePath FileParsed
64 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
65 -- , _fileParsed_result :: [Document]
66 -- } deriving (Show)
67
68
69 -- | According to the format of Input file,
70 -- different parser are available.
71 data FileFormat = WOS | RIS | RisPresse
72 | CsvGargV3 | CsvHal
73 | ZIP
74 deriving (Show)
75
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
81
82
83 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
84 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
85 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
86 parseFormat RisPresse bs = do
87 docs <- mapM (toDoc RIS)
88 <$> snd
89 <$> enrichWith RisPresse
90 $ partitionEithers
91 $ [runParser' RisPresse bs]
92 pure $ Right docs
93 parseFormat WOS bs = do
94 docs <- mapM (toDoc WOS)
95 <$> snd
96 <$> enrichWith WOS
97 $ partitionEithers
98 $ [runParser' WOS bs]
99 pure $ Right docs
100 parseFormat ZIP bs = do
101 path <- emptySystemTempFile "parsed-zip"
102 DB.writeFile path bs
103 parsedZip <- withArchive path $ do
104 DM.keys <$> getEntries
105 pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
106 parseFormat _ _ = undefined
107
108 -- | Parse file into documents
109 -- TODO manage errors here
110 -- TODO: to debug maybe add the filepath in error message
111 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
112 parseFile CsvHal p = parseHal p
113 parseFile CsvGargV3 p = parseCsv p
114 parseFile RisPresse p = do
115 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
116 pure $ Right docs
117 parseFile WOS p = do
118 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
119 pure $ Right docs
120 parseFile ff p = do
121 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
122 pure $ Right docs
123
124 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
125 -- TODO use language for RIS
126 toDoc ff d = do
127 -- let abstract = lookup "abstract" d
128 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
129
130 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
131
132 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
133
134 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
135 , _hd_doi = lookup "doi" d
136 , _hd_url = lookup "URL" d
137 , _hd_uniqId = Nothing
138 , _hd_uniqIdBdd = Nothing
139 , _hd_page = Nothing
140 , _hd_title = lookup "title" d
141 , _hd_authors = Nothing
142 , _hd_institutes = lookup "authors" d
143 , _hd_source = lookup "source" d
144 , _hd_abstract = lookup "abstract" d
145 , _hd_publication_date = fmap (DT.pack . show) utcTime
146 , _hd_publication_year = pub_year
147 , _hd_publication_month = pub_month
148 , _hd_publication_day = pub_day
149 , _hd_publication_hour = Nothing
150 , _hd_publication_minute = Nothing
151 , _hd_publication_second = Nothing
152 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
153
154 enrichWith :: FileFormat
155 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
156 enrichWith RisPresse = enrichWith' presseEnrich
157 enrichWith WOS = enrichWith' (map (first WOS.keys))
158 enrichWith _ = enrichWith' identity
159
160
161 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
162 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
163 enrichWith' f = second (map both' . map f . concat)
164 where
165 both' = map (both decodeUtf8)
166
167
168
169 readFileWith :: FileFormat -> FilePath
170 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
171 readFileWith format path = do
172 files <- case takeExtension path of
173 ".zip" -> openZip path
174 _ -> pure <$> clean <$> DB.readFile path
175 partitionEithers <$> mapConcurrently (runParser format) files
176
177
178 -- | withParser:
179 -- According to the format of the text, choose the right parser.
180 -- TODO withParser :: FileFormat -> Parser [Document]
181 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
182 withParser WOS = WOS.parser
183 withParser RIS = RIS.parser
184 --withParser ODT = odtParser
185 --withParser XML = xmlParser
186 withParser _ = panic "[ERROR] Parser not implemented yet"
187
188 runParser :: FileFormat -> DB.ByteString
189 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
190 runParser format text = pure $ runParser' format text
191
192 runParser' :: FileFormat -> DB.ByteString
193 -> (Either String [[(DB.ByteString, DB.ByteString)]])
194 runParser' format text = parseOnly (withParser format) text
195
196 openZip :: FilePath -> IO [DB.ByteString]
197 openZip fp = do
198 entries <- withArchive fp (DM.keys <$> getEntries)
199 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
200 pure bs
201
202 cleanText :: Text -> Text
203 cleanText = cs . clean . cs
204
205 clean :: DB.ByteString -> DB.ByteString
206 clean txt = DBC.map clean' txt
207 where
208 clean' '’' = '\''
209 clean' '\r' = ' '
210 clean' '\t' = ' '
211 clean' ';' = '.'
212 clean' c = c