]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
[FIX] Xml parsers
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
24 where
25
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join)
29 import Data.Attoparsec.ByteString (parseOnly, Parser)
30 import Data.Either(Either(..))
31 import Data.Either.Extra (partitionEithers)
32 import Data.List (concat, lookup)
33 import Data.Ord()
34 import Data.String (String())
35 import Data.String()
36 import Data.Text (Text)
37 import Data.Text.Encoding (decodeUtf8)
38 import Data.Tuple.Extra (both, first, second)
39 import Gargantext.Core (Lang(..))
40 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
41 import Gargantext.Prelude
42 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
43 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
44 import Gargantext.Core.Text.Learn (detectLangDefault)
45 import System.FilePath (FilePath(), takeExtension)
46 import qualified Data.ByteString as DB
47 import qualified Data.ByteString.Char8 as DBC
48 import qualified Data.ByteString.Lazy as DBL
49 import qualified Data.Map as DM
50 import qualified Data.Text as DT
51 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
52 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
53 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
54 ------------------------------------------------------------------------
55
56 type ParseError = String
57 --type Field = Text
58 --type Document = DM.Map Field Text
59 --type FilesParsed = DM.Map FilePath FileParsed
60 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
61 -- , _fileParsed_result :: [Document]
62 -- } deriving (Show)
63
64
65 -- | According to the format of Input file,
66 -- different parser are available.
67 data FileFormat = WOS | RIS | RisPresse
68 | CsvGargV3 | CsvHal
69 deriving (Show)
70
71 -- Implemented (ISI Format)
72 -- | DOC -- Not Implemented / import Pandoc
73 -- | ODT -- Not Implemented / import Pandoc
74 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
75 -- | XML -- Not Implemented / see :
76
77
78 parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
79 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
80 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
81 parseFormat RisPresse bs = mapM (toDoc RIS)
82 <$> snd
83 <$> enrichWith RisPresse
84 $ partitionEithers
85 $ [runParser' RisPresse bs]
86 parseFormat WOS bs = mapM (toDoc WOS)
87 <$> snd
88 <$> enrichWith WOS
89 $ partitionEithers
90 $ [runParser' WOS bs]
91 parseFormat _ _ = undefined
92
93 -- | Parse file into documents
94 -- TODO manage errors here
95 -- TODO: to debug maybe add the filepath in error message
96 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
97 parseFile CsvHal p = parseHal p
98 parseFile CsvGargV3 p = parseCsv p
99 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
100 parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
101 parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
102
103 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
104 -- TODO use language for RIS
105 toDoc ff d = do
106 let abstract = lookup "abstract" d
107 let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
108
109 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
110
111 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
112
113 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
114 (lookup "doi" d)
115 (lookup "URL" d)
116 Nothing
117 Nothing
118 Nothing
119 (lookup "title" d)
120 Nothing
121 (lookup "authors" d)
122 (lookup "source" d)
123 (lookup "abstract" d)
124 (fmap (DT.pack . show) utcTime)
125 (pub_year)
126 (pub_month)
127 (pub_day)
128 Nothing
129 Nothing
130 Nothing
131 (Just $ (DT.pack . show) lang)
132
133 enrichWith :: FileFormat
134 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
135 enrichWith RisPresse = enrichWith' presseEnrich
136 enrichWith WOS = enrichWith' (map (first WOS.keys))
137 enrichWith _ = enrichWith' identity
138
139
140 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
141 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
142 enrichWith' f = second (map both' . map f . concat)
143 where
144 both' = map (both decodeUtf8)
145
146
147
148 readFileWith :: FileFormat -> FilePath
149 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
150 readFileWith format path = do
151 files <- case takeExtension path of
152 ".zip" -> openZip path
153 _ -> pure <$> clean <$> DB.readFile path
154 partitionEithers <$> mapConcurrently (runParser format) files
155
156
157 -- | withParser:
158 -- According to the format of the text, choose the right parser.
159 -- TODO withParser :: FileFormat -> Parser [Document]
160 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
161 withParser WOS = WOS.parser
162 withParser RIS = RIS.parser
163 --withParser ODT = odtParser
164 --withParser XML = xmlParser
165 withParser _ = panic "[ERROR] Parser not implemented yet"
166
167 runParser :: FileFormat -> DB.ByteString
168 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
169 runParser format text = pure $ runParser' format text
170
171 runParser' :: FileFormat -> DB.ByteString
172 -> (Either String [[(DB.ByteString, DB.ByteString)]])
173 runParser' format text = parseOnly (withParser format) text
174
175 openZip :: FilePath -> IO [DB.ByteString]
176 openZip fp = do
177 entries <- withArchive fp (DM.keys <$> getEntries)
178 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
179 pure bs
180
181 cleanText :: Text -> Text
182 cleanText = cs . clean . cs
183
184 clean :: DB.ByteString -> DB.ByteString
185 clean txt = DBC.map clean' txt
186 where
187 clean' '’' = '\''
188 clean' '\r' = ' '
189 clean' '\t' = ' '
190 clean' ';' = '.'
191 clean' c = c