{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext analyzes semi-structured text, which must be parsed before it
can be analyzed.

The parsers assume the format of the text is known (see the 'FileFormat'
data type); according to it, the right parser is chosen among the list
of available parsers.

This module mainly describes how to add a new parser to Gargantext;
please follow the types.
-}
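
-- A minimal usage sketch (illustrative only, not part of the module; the
-- file path below is hypothetical): parse a Web of Science export with
-- 'parseFile', which picks the right parser for the given 'FileFormat'.
--
-- >>> eDocs <- parseFile WOS "/tmp/records.isi"
-- >>> either putStrLn (print . length) eDocs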

{-# LANGUAGE PackageImports #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
    where

import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either (Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Prelude as Prelude

import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
------------------------------------------------------------------------

type ParseError = String
--type Field       = Text
--type Document    = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors :: Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)


-- | According to the format of the input file,
--   a different parser is chosen.
data FileFormat = WOS | RIS | RisPresse
                | CsvGargV3 | CsvHal
                | ZIP
  deriving (Show)

                  -- Implemented (ISI Format)
--              | DOC        -- Not Implemented / import Pandoc
--              | ODT        -- Not Implemented / import Pandoc
--              | PDF        -- Not Implemented / pdftotext and import Pandoc ?
--              | XML        -- Not Implemented / see :


parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = do
  docs <- mapM (toDoc RIS)
          <$> snd
          <$> enrichWith RisPresse
            $ partitionEithers
            $ [runParser' RisPresse bs]
  pure $ Right docs
parseFormat WOS bs = do
  docs <- mapM (toDoc WOS)
          <$> snd
          <$> enrichWith WOS
            $ partitionEithers
            $ [runParser' WOS bs]
  pure $ Right docs
parseFormat ZIP _bs = do
  printDebug "[parseFormat]" ZIP
  pure $ Left "Not implemented for ZIP"
parseFormat _ _ = undefined
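
-- A usage sketch for 'parseFormat' (illustrative; the file name is
-- hypothetical): unlike 'parseFile', it works on bytes already in memory,
-- e.g. the content of an uploaded file.
--
-- >>> bs <- DB.readFile "/tmp/corpus.csv"
-- >>> eDocs <- parseFormat CsvGargV3 bs
-- >>> either putStrLn (print . length) eDocs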

-- | Parse a file into documents.
-- TODO manage errors here
-- TODO: to debug, maybe add the filepath to the error message
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = do
  docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs
parseFile WOS p = do
  docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs
parseFile ff p = do
  docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs

toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

  let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d

  (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse

  pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                           , _hd_doi = lookup "doi" d
                           , _hd_url = lookup "URL" d
                           , _hd_uniqId = Nothing
                           , _hd_uniqIdBdd = Nothing
                           , _hd_page = Nothing
                           , _hd_title = lookup "title" d
                           , _hd_authors = Nothing
                           , _hd_institutes = lookup "authors" d
                           , _hd_source = lookup "source" d
                           , _hd_abstract = lookup "abstract" d
                           , _hd_publication_date = fmap (DT.pack . show) utcTime
                           , _hd_publication_year = pub_year
                           , _hd_publication_month = pub_month
                           , _hd_publication_day = pub_day
                           , _hd_publication_hour = Nothing
                           , _hd_publication_minute = Nothing
                           , _hd_publication_second = Nothing
                           , _hd_language_iso2 = Just $ (DT.pack . show) lang }

enrichWith :: FileFormat
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith _         = enrichWith' identity


enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)


readFileWith :: FileFormat -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
    files <- case takeExtension path of
               ".zip" -> openZip path
               _      -> pure <$> clean <$> DB.readFile path
    partitionEithers <$> mapConcurrently (runParser format) files

-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _   = panic "[ERROR] Parser not implemented yet"

runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileFormat -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
    entries <- withArchive fp (DM.keys <$> getEntries)
    bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
    pure bs

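-- A small sketch of the byte-level cleaning below (illustrative): tabs and
-- carriage returns become spaces, semicolons become dots.
--
-- >>> cleanText "title;\tsubtitle"
-- "title. subtitle"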
cleanText :: Text -> Text
cleanText = cs . clean . cs

clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
  where
    clean' '’'  = '\''
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c