]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
Merge branch 'dev' into dev-wikidata
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
24 where
25
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Data.Attoparsec.ByteString (parseOnly, Parser)
29 import Control.Monad (join)
30 import Data.Either(Either(..))
31 import Data.Either.Extra (partitionEithers)
32 import Data.List (concat, lookup)
33 import Data.Ord()
34 import Data.String (String())
35 import Data.String()
36 import Data.Text (Text)
37 import Data.Text.Encoding (decodeUtf8)
38 import Data.Tuple.Extra (both, first, second)
39 import System.FilePath (FilePath(), takeExtension)
40 import qualified Data.ByteString as DB
41 import qualified Data.ByteString.Char8 as DBC
42 import qualified Data.ByteString.Lazy as DBL
43 import qualified Data.Map as DM
44 import qualified Data.Text as DT
45 import qualified Prelude as Prelude
46 import System.IO.Temp (emptySystemTempFile)
47
48 import Gargantext.Core (Lang(..))
49 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
50 import Gargantext.Prelude
51 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
52 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
53 -- import Gargantext.Core.Text.Learn (detectLangDefault)
54 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
55 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
56 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
57 ------------------------------------------------------------------------
58
59 type ParseError = String
60 --type Field = Text
61 --type Document = DM.Map Field Text
62 --type FilesParsed = DM.Map FilePath FileParsed
63 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
64 -- , _fileParsed_result :: [Document]
65 -- } deriving (Show)
66
67
68 -- | According to the format of Input file,
69 -- different parser are available.
70 data FileFormat = WOS | RIS | RisPresse
71 | CsvGargV3 | CsvHal
72 | ZIP
73 deriving (Show)
74
75 -- Implemented (ISI Format)
76 -- | DOC -- Not Implemented / import Pandoc
77 -- | ODT -- Not Implemented / import Pandoc
78 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
79 -- | XML -- Not Implemented / see :
80
81
82 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
83 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
84 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
85 parseFormat RisPresse bs = do
86 docs <- mapM (toDoc RIS)
87 <$> snd
88 <$> enrichWith RisPresse
89 $ partitionEithers
90 $ [runParser' RisPresse bs]
91 pure $ Right docs
92 parseFormat WOS bs = do
93 docs <- mapM (toDoc WOS)
94 <$> snd
95 <$> enrichWith WOS
96 $ partitionEithers
97 $ [runParser' WOS bs]
98 pure $ Right docs
99 parseFormat ZIP bs = do
100 path <- emptySystemTempFile "parsed-zip"
101 DB.writeFile path bs
102 parsedZip <- withArchive path $ do
103 DM.keys <$> getEntries
104 pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
105 parseFormat _ _ = undefined
106
107 -- | Parse file into documents
108 -- TODO manage errors here
109 -- TODO: to debug maybe add the filepath in error message
110 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
111 parseFile CsvHal p = parseHal p
112 parseFile CsvGargV3 p = parseCsv p
113 parseFile RisPresse p = do
114 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
115 pure $ Right docs
116 parseFile WOS p = do
117 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
118 pure $ Right docs
119 parseFile ff p = do
120 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
121 pure $ Right docs
122
123 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
124 -- TODO use language for RIS
125 toDoc ff d = do
126 -- let abstract = lookup "abstract" d
127 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
128
129 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
130
131 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
132
133 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
134 , _hd_doi = lookup "doi" d
135 , _hd_url = lookup "URL" d
136 , _hd_uniqId = Nothing
137 , _hd_uniqIdBdd = Nothing
138 , _hd_page = Nothing
139 , _hd_title = lookup "title" d
140 , _hd_authors = Nothing
141 , _hd_institutes = lookup "authors" d
142 , _hd_source = lookup "source" d
143 , _hd_abstract = lookup "abstract" d
144 , _hd_publication_date = fmap (DT.pack . show) utcTime
145 , _hd_publication_year = pub_year
146 , _hd_publication_month = pub_month
147 , _hd_publication_day = pub_day
148 , _hd_publication_hour = Nothing
149 , _hd_publication_minute = Nothing
150 , _hd_publication_second = Nothing
151 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
152
153 enrichWith :: FileFormat
154 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
155 enrichWith RisPresse = enrichWith' presseEnrich
156 enrichWith WOS = enrichWith' (map (first WOS.keys))
157 enrichWith _ = enrichWith' identity
158
159
160 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
161 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
162 enrichWith' f = second (map both' . map f . concat)
163 where
164 both' = map (both decodeUtf8)
165
166
167
168 readFileWith :: FileFormat -> FilePath
169 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
170 readFileWith format path = do
171 files <- case takeExtension path of
172 ".zip" -> openZip path
173 _ -> pure <$> clean <$> DB.readFile path
174 partitionEithers <$> mapConcurrently (runParser format) files
175
176
177 -- | withParser:
178 -- According to the format of the text, choose the right parser.
179 -- TODO withParser :: FileFormat -> Parser [Document]
180 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
181 withParser WOS = WOS.parser
182 withParser RIS = RIS.parser
183 --withParser ODT = odtParser
184 --withParser XML = xmlParser
185 withParser _ = panic "[ERROR] Parser not implemented yet"
186
187 runParser :: FileFormat -> DB.ByteString
188 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
189 runParser format text = pure $ runParser' format text
190
191 runParser' :: FileFormat -> DB.ByteString
192 -> (Either String [[(DB.ByteString, DB.ByteString)]])
193 runParser' format text = parseOnly (withParser format) text
194
195 openZip :: FilePath -> IO [DB.ByteString]
196 openZip fp = do
197 entries <- withArchive fp (DM.keys <$> getEntries)
198 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
199 pure bs
200
201 cleanText :: Text -> Text
202 cleanText = cs . clean . cs
203
204 clean :: DB.ByteString -> DB.ByteString
205 clean txt = DBC.map clean' txt
206 where
207 clean' '’' = '\''
208 clean' '\r' = ' '
209 clean' '\t' = ' '
210 clean' ';' = '.'
211 clean' c = c