]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers.hs
[Clean] before factoring
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
26 where
27
28 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Data.Attoparsec.ByteString (parseOnly, Parser)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat, lookup)
35 import Data.Ord()
36 import Data.String (String())
37 import Data.String()
38 import Data.Text (Text)
39 import Data.Text.Encoding (decodeUtf8)
40 import Data.Tuple.Extra (both, first, second)
41 import Gargantext.Core (Lang(..))
42 import Gargantext.Database.Admin.Types.Node (HyperdataDocument(..))
43 import Gargantext.Prelude
44 import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
45 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
46 import Gargantext.Text.Learn (detectLangDefault)
47 import System.FilePath (FilePath(), takeExtension)
48 import qualified Data.ByteString as DB
49 import qualified Data.ByteString.Char8 as DBC
50 import qualified Data.ByteString.Lazy as DBL
51 import qualified Data.Map as DM
52 import qualified Data.Text as DT
53 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
54 import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
55 import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
56 ------------------------------------------------------------------------
57
58 type ParseError = String
59 --type Field = Text
60 --type Document = DM.Map Field Text
61 --type FilesParsed = DM.Map FilePath FileParsed
62 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
63 -- , _fileParsed_result :: [Document]
64 -- } deriving (Show)
65
66
67 -- | According to the format of Input file,
68 -- different parser are available.
69 data FileFormat = WOS | RIS | RisPresse
70 | CsvGargV3 | CsvHal
71 deriving (Show)
72
73 -- Implemented (ISI Format)
74 -- | DOC -- Not Implemented / import Pandoc
75 -- | ODT -- Not Implemented / import Pandoc
76 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
77 -- | XML -- Not Implemented / see :
78
79
80 parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
81 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
82 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
83 parseFormat RisPresse bs = mapM (toDoc RIS)
84 <$> snd
85 <$> enrichWith RisPresse
86 $ partitionEithers
87 $ [runParser' RisPresse bs]
88 parseFormat WOS bs = mapM (toDoc WOS)
89 <$> snd
90 <$> enrichWith WOS
91 $ partitionEithers
92 $ [runParser' WOS bs]
93 parseFormat _ _ = undefined
94
95 -- | Parse file into documents
96 -- TODO manage errors here
97 -- TODO: to debug maybe add the filepath in error message
98 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
99 parseFile CsvHal p = parseHal p
100 parseFile CsvGargV3 p = parseCsv p
101 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
102 parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
103 parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
104
105 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
106 -- TODO use language for RIS
107 toDoc ff d = do
108 let abstract = lookup "abstract" d
109 let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
110
111 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
112
113 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
114
115 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
116 (lookup "doi" d)
117 (lookup "URL" d)
118 Nothing
119 Nothing
120 Nothing
121 (lookup "title" d)
122 Nothing
123 (lookup "authors" d)
124 (lookup "source" d)
125 (lookup "abstract" d)
126 (fmap (DT.pack . show) utcTime)
127 (pub_year)
128 (pub_month)
129 (pub_day)
130 Nothing
131 Nothing
132 Nothing
133 (Just $ (DT.pack . show) lang)
134
135 enrichWith :: FileFormat
136 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
137 enrichWith RisPresse = enrichWith' presseEnrich
138 enrichWith WOS = enrichWith' (map (first WOS.keys))
139 enrichWith _ = enrichWith' identity
140
141
142 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
143 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
144 enrichWith' f = second (map both' . map f . concat)
145 where
146 both' = map (both decodeUtf8)
147
148
149
150 readFileWith :: FileFormat -> FilePath
151 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
152 readFileWith format path = do
153 files <- case takeExtension path of
154 ".zip" -> openZip path
155 _ -> pure <$> clean <$> DB.readFile path
156 partitionEithers <$> mapConcurrently (runParser format) files
157
158
159 -- | withParser:
160 -- According to the format of the text, choose the right parser.
161 -- TODO withParser :: FileFormat -> Parser [Document]
162 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
163 withParser WOS = WOS.parser
164 withParser RIS = RIS.parser
165 --withParser ODT = odtParser
166 --withParser XML = xmlParser
167 withParser _ = panic "[ERROR] Parser not implemented yet"
168
169 runParser :: FileFormat -> DB.ByteString
170 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
171 runParser format text = pure $ runParser' format text
172
173 runParser' :: FileFormat -> DB.ByteString
174 -> (Either String [[(DB.ByteString, DB.ByteString)]])
175 runParser' format text = parseOnly (withParser format) text
176
177 openZip :: FilePath -> IO [DB.ByteString]
178 openZip fp = do
179 entries <- withArchive fp (DM.keys <$> getEntries)
180 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
181 pure bs
182
183 cleanText :: Text -> Text
184 cleanText = cs . clean . cs
185
186 clean :: DB.ByteString -> DB.ByteString
187 clean txt = DBC.map clean' txt
188 where
189 clean' '’' = '\''
190 clean' '\r' = ' '
191 clean' '\t' = ' '
192 clean' ';' = '.'
193 clean' c = c