]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
Merge branch '70-dev-searx-parser' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell...
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
24 where
25
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join)
29 import Data.Attoparsec.ByteString (parseOnly, Parser)
30 import Data.Either(Either(..))
31 import Data.Either.Extra (partitionEithers)
32 import Data.List (concat, lookup)
33 import Data.Ord()
34 import Data.String (String())
35 import Data.String()
36 import Data.Text (Text)
37 import Data.Text.Encoding (decodeUtf8)
38 import Data.Tuple.Extra (both, first, second)
39 import System.FilePath (FilePath(), takeExtension)
40 import qualified Data.ByteString as DB
41 import qualified Data.ByteString.Char8 as DBC
42 import qualified Data.ByteString.Lazy as DBL
43 import qualified Data.Map as DM
44 import qualified Data.Text as DT
45 import qualified Prelude as Prelude
46
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
49 import Gargantext.Prelude
50 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
51 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
52 -- import Gargantext.Core.Text.Learn (detectLangDefault)
53 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
54 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
55 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
56 ------------------------------------------------------------------------
57
58 type ParseError = String
59 --type Field = Text
60 --type Document = DM.Map Field Text
61 --type FilesParsed = DM.Map FilePath FileParsed
62 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
63 -- , _fileParsed_result :: [Document]
64 -- } deriving (Show)
65
66
67 -- | According to the format of Input file,
68 -- different parser are available.
69 data FileFormat = WOS | RIS | RisPresse
70 | CsvGargV3 | CsvHal
71 deriving (Show)
72
73 -- Implemented (ISI Format)
74 -- | DOC -- Not Implemented / import Pandoc
75 -- | ODT -- Not Implemented / import Pandoc
76 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
77 -- | XML -- Not Implemented / see :
78
79
80 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
81 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
82 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
83 parseFormat RisPresse bs = do
84 docs <- mapM (toDoc RIS)
85 <$> snd
86 <$> enrichWith RisPresse
87 $ partitionEithers
88 $ [runParser' RisPresse bs]
89 pure $ Right docs
90 parseFormat WOS bs = do
91 docs <- mapM (toDoc WOS)
92 <$> snd
93 <$> enrichWith WOS
94 $ partitionEithers
95 $ [runParser' WOS bs]
96 pure $ Right docs
97 parseFormat _ _ = undefined
98
99 -- | Parse file into documents
100 -- TODO manage errors here
101 -- TODO: to debug maybe add the filepath in error message
102 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
103 parseFile CsvHal p = parseHal p
104 parseFile CsvGargV3 p = parseCsv p
105 parseFile RisPresse p = do
106 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
107 pure $ Right docs
108 parseFile WOS p = do
109 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
110 pure $ Right docs
111 parseFile ff p = do
112 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
113 pure $ Right docs
114
115 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
116 -- TODO use language for RIS
117 toDoc ff d = do
118 -- let abstract = lookup "abstract" d
119 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
120
121 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
122
123 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
124
125 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
126 (lookup "doi" d)
127 (lookup "URL" d)
128 Nothing
129 Nothing
130 Nothing
131 (lookup "title" d)
132 Nothing
133 (lookup "authors" d)
134 (lookup "source" d)
135 (lookup "abstract" d)
136 (fmap (DT.pack . show) utcTime)
137 (pub_year)
138 (pub_month)
139 (pub_day)
140 Nothing
141 Nothing
142 Nothing
143 (Just $ (DT.pack . show) lang)
144
145 enrichWith :: FileFormat
146 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
147 enrichWith RisPresse = enrichWith' presseEnrich
148 enrichWith WOS = enrichWith' (map (first WOS.keys))
149 enrichWith _ = enrichWith' identity
150
151
152 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
153 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
154 enrichWith' f = second (map both' . map f . concat)
155 where
156 both' = map (both decodeUtf8)
157
158
159
160 readFileWith :: FileFormat -> FilePath
161 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
162 readFileWith format path = do
163 files <- case takeExtension path of
164 ".zip" -> openZip path
165 _ -> pure <$> clean <$> DB.readFile path
166 partitionEithers <$> mapConcurrently (runParser format) files
167
168
169 -- | withParser:
170 -- According to the format of the text, choose the right parser.
171 -- TODO withParser :: FileFormat -> Parser [Document]
172 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
173 withParser WOS = WOS.parser
174 withParser RIS = RIS.parser
175 --withParser ODT = odtParser
176 --withParser XML = xmlParser
177 withParser _ = panic "[ERROR] Parser not implemented yet"
178
179 runParser :: FileFormat -> DB.ByteString
180 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
181 runParser format text = pure $ runParser' format text
182
183 runParser' :: FileFormat -> DB.ByteString
184 -> (Either String [[(DB.ByteString, DB.ByteString)]])
185 runParser' format text = parseOnly (withParser format) text
186
187 openZip :: FilePath -> IO [DB.ByteString]
188 openZip fp = do
189 entries <- withArchive fp (DM.keys <$> getEntries)
190 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
191 pure bs
192
193 cleanText :: Text -> Text
194 cleanText = cs . clean . cs
195
196 clean :: DB.ByteString -> DB.ByteString
197 clean txt = DBC.map clean' txt
198 where
199 clean' '’' = '\''
200 clean' '\r' = ' '
201 clean' '\t' = ' '
202 clean' ';' = '.'
203 clean' c = c