]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers.hs
[docker] update image, add README info
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
26 where
27
28 --import Data.ByteString (ByteString)
29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
30 import Control.Concurrent.Async as CCA (mapConcurrently)
31 import Control.Monad (join)
32 import Data.Attoparsec.ByteString (parseOnly, Parser)
33 import Data.Either(Either(..))
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat, lookup)
36 import Data.Ord()
37 import Data.String (String())
38 import Data.String()
39 import Data.Text (Text)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Tuple.Extra (both, first, second)
42 import Gargantext.Core (Lang(..))
43 import Gargantext.Database.Types.Node (HyperdataDocument(..))
44 import Gargantext.Prelude
45 import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
46 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
47 import Gargantext.Text.Learn (detectLangDefault)
48 import System.FilePath (FilePath(), takeExtension)
49 import qualified Data.ByteString as DB
50 import qualified Data.ByteString.Lazy as DBL
51 import qualified Data.ByteString.Char8 as DBC
52 import qualified Data.Map as DM
53 import qualified Data.Text as DT
54 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
55 import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
56 import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
57 ------------------------------------------------------------------------
58
59 type ParseError = String
60 --type Field = Text
61 --type Document = DM.Map Field Text
62 --type FilesParsed = DM.Map FilePath FileParsed
63 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
64 -- , _fileParsed_result :: [Document]
65 -- } deriving (Show)
66
67
68 -- | According to the format of Input file,
69 -- different parser are available.
70 data FileFormat = WOS | RIS | RisPresse
71 | CsvGargV3 | CsvHal
72 deriving (Show)
73
74 -- Implemented (ISI Format)
75 -- | DOC -- Not Implemented / import Pandoc
76 -- | ODT -- Not Implemented / import Pandoc
77 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
78 -- | XML -- Not Implemented / see :
79
80
81 parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
82 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
83 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
84 parseFormat RisPresse bs = mapM (toDoc RIS)
85 <$> snd
86 <$> enrichWith RisPresse
87 $ partitionEithers
88 $ [runParser' RisPresse bs]
89 parseFormat WOS bs = mapM (toDoc WOS)
90 <$> snd
91 <$> enrichWith WOS
92 $ partitionEithers
93 $ [runParser' WOS bs]
94 parseFormat _ _ = undefined
95
96 -- | Parse file into documents
97 -- TODO manage errors here
98 -- TODO: to debug maybe add the filepath in error message
99 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
100 parseFile CsvHal p = parseHal p
101 parseFile CsvGargV3 p = parseCsv p
102 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
103 parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
104 parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
105
106 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
107 -- TODO use language for RIS
108 toDoc ff d = do
109 let abstract = lookup "abstract" d
110 let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
111
112 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
113
114 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
115
116 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
117 (lookup "doi" d)
118 (lookup "URL" d)
119 Nothing
120 Nothing
121 Nothing
122 (lookup "title" d)
123 Nothing
124 (lookup "authors" d)
125 (lookup "source" d)
126 (lookup "abstract" d)
127 (fmap (DT.pack . show) utcTime)
128 (pub_year)
129 (pub_month)
130 (pub_day)
131 Nothing
132 Nothing
133 Nothing
134 (Just $ (DT.pack . show) lang)
135
136 enrichWith :: FileFormat
137 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
138 enrichWith RisPresse = enrichWith' presseEnrich
139 enrichWith WOS = enrichWith' (map (first WOS.keys))
140 enrichWith _ = enrichWith' identity
141
142
143 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
144 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
145 enrichWith' f = second (map both' . map f . concat)
146 where
147 both' = map (both decodeUtf8)
148
149
150
151 readFileWith :: FileFormat -> FilePath
152 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
153 readFileWith format path = do
154 files <- case takeExtension path of
155 ".zip" -> openZip path
156 _ -> pure <$> clean <$> DB.readFile path
157 partitionEithers <$> mapConcurrently (runParser format) files
158
159
160 -- | withParser:
161 -- According to the format of the text, choose the right parser.
162 -- TODO withParser :: FileFormat -> Parser [Document]
163 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
164 withParser WOS = WOS.parser
165 withParser RIS = RIS.parser
166 --withParser ODT = odtParser
167 --withParser XML = xmlParser
168 withParser _ = panic "[ERROR] Parser not implemented yet"
169
170 runParser :: FileFormat -> DB.ByteString
171 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
172 runParser format text = pure $ runParser' format text
173
174 runParser' :: FileFormat -> DB.ByteString
175 -> (Either String [[(DB.ByteString, DB.ByteString)]])
176 runParser' format text = parseOnly (withParser format) text
177
178 openZip :: FilePath -> IO [DB.ByteString]
179 openZip fp = do
180 entries <- withArchive fp (DM.keys <$> getEntries)
181 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
182 pure bs
183
184 cleanText :: Text -> Text
185 cleanText = cs . clean . cs
186
187 clean :: DB.ByteString -> DB.ByteString
188 clean txt = DBC.map clean' txt
189 where
190 clean' '’' = '\''
191 clean' '\r' = ' '
192 clean' '\t' = ' '
193 clean' ';' = '.'
194 clean' c = c