2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers assume we know the format of the text (the FileFormat data
14 type), according to which the right parser is chosen among the list of
17 This module mainly describes how to add a new parser to Gargantext;
18 please follow the types.
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs, risPress2csv)
28 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Data.Attoparsec.ByteString (parseOnly, Parser)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat)
35 import Data.List (lookup)
37 import Data.String (String())
39 import Data.Text (Text)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Time (UTCTime(..))
42 import Data.Tuple.Extra (both, second)
43 import System.FilePath (FilePath(), takeExtension)
44 import qualified Data.ByteString as DB
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Data.Time as DT
49 ------------------------------------------------------------------------
50 import Gargantext.Core (Lang(..))
51 import Gargantext.Prelude
52 import Gargantext.Database.Types.Node (HyperdataDocument(..))
53 import Gargantext.Text.Parsers.WOS (wosParser)
54 import Gargantext.Text.Parsers.RIS (risParser)
55 import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
56 import Gargantext.Text.Parsers.Date (parseDate)
57 import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
58 import Gargantext.Text.Terms.Stop (detectLang)
59 ------------------------------------------------------------------------
-- | Error message reported for an input that failed to parse.
61 type ParseError = String
63 --type Document = DM.Map Field Text
64 --type FilesParsed = DM.Map FilePath FileParsed
65 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
66 -- , _fileParsed_result :: [Document]
70 -- | Input file formats known to this module; the format decides which
71 -- parser is used.
72 data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
75 -- Implemented (ISI Format)
76 -- | DOC -- Not Implemented / import Pandoc
77 -- | ODT -- Not Implemented / import Pandoc
78 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
79 -- | XML -- Not Implemented / see :
80 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
82 -- TODO: to debug maybe add the filepath in error message
-- | Parse the file at the given path into a list of documents.
--
-- * 'CsvHalFormat' files are handed to 'parseHal' directly.
-- * 'RisPresse' files are parsed as 'RIS', every record being passed through
--   'presseEnrich' before it is converted to a document.
-- * Every other format goes through 'parse'.
--
-- TODO manage errors here: the list of parse errors (first component of the
-- parse result) is currently dropped.
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs CsvHalFormat p = parseHal p
parseDocs RisPresse p = do
  parsed <- enrichWith presseEnrich <$> parse' RIS p
  mapM (toDoc RIS) (snd parsed)
parseDocs ff path = do
  parsed <- parse ff path
  mapM (toDoc ff) (snd parsed)
-- | Parse an optional textual date into a 'UTCTime' together with its
-- year, month and day components.
--
-- When no text is given, every component of the result is 'Nothing'.
-- TODO add hours, minutes and seconds
parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
parseDate' lang (Just raw) = do
  parsed <- parseDate lang raw
  -- Split the calendar day of the parsed time into its Gregorian parts.
  let (year, month, dayOfMonth) = DT.toGregorian (utctDay parsed)
  pure (Just parsed, (Just (fromIntegral year), Just month, Just dayOfMonth))
-- | Build a 'HyperdataDocument' from the key/value fields of one parsed
-- record. The file format, rendered with 'show', is the first constructor
-- argument. (The equation head binding @ff@ and @d@, and several constructor
-- arguments, are on lines not visible in this view.)
107 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
108 -- TODO use language for RIS
-- The abstract, when present, is read from the "abstract" key.
110 let abstract = lookup "abstract" d
-- The language is detected from the first 50 characters of the abstract and
-- falls back to EN when there is no abstract or detection yields Nothing.
111 let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
-- NOTE(review): assuming the standard fixities, '<>' binds tighter than
-- '<$>', so the "PY" value, a single space and the "publication_date" value
-- are concatenated first and only then are dashes replaced by spaces.
-- Because of the @Just " "@ operand the result is always a 'Just', even when
-- both keys are missing, so parseDate' never receives Nothing from here --
-- confirm this is intended.
113 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
115 (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
117 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
127 (lookup "abstract" d)
-- Parsed UTC time rendered with 'show'.
128 (fmap (DT.pack . show) utcTime)
-- Detected language rendered with 'show'.
135 (Just $ (DT.pack . show) lang)
-- | Parse the file at the given path with the parser of the given format and
-- decode every key and value to 'Text', leaving the records otherwise
-- untouched. Returns the parse errors alongside the parsed records.
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse ff fp = do
  raw <- parse' ff fp
  pure (enrichWith identity raw)
141 ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
142 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
-- Works on the second component of the pair only: flattens one level of
-- nesting, applies @f@ to the field list of every record, then converts the
-- fields of every record to 'Text'.
143 enrichWith f = second (map both' . map f . concat)
-- Decode both the key and the value of every pair with 'decodeUtf8'.
-- NOTE(review): 'decodeUtf8' is documented to throw on invalid UTF-8 input --
-- confirm inputs are always valid UTF-8.
145 both' = map (both decodeUtf8)
-- | Read the input at the given path and run the parser of the given format
-- on each piece of content concurrently.
--
-- A path whose extension is @.zip@ is opened with 'openZip', giving one
-- content per archive entry; any other path is read as a single file.
-- The result separates the parse errors from the successfully parsed
-- contents.
parse' :: FileFormat -> FilePath
       -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
parse' format path =
  loadInputs >>= fmap partitionEithers . mapConcurrently (runParser format)
  where
    -- One ByteString per archive entry, or a singleton list for a plain file.
    loadInputs = case takeExtension path of
      ".zip" -> openZip path
      _      -> fmap pure (DB.readFile path)
-- | Select the parser matching the given file format.
--
-- Only 'WOS' and 'RIS' have a parser here; any other format calls 'panic'.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser format = case format of
  WOS -> wosParser
  RIS -> risParser
  --ODT -> odtParser
  --XML -> xmlParser
  _   -> panic "[ERROR] Parser not implemented yet"
-- | Run the parser chosen for the given format over the whole input,
-- returning either the parser's error message or the parsed records.
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format = pure . parseOnly (withParser format)
-- | Read the entries of a zip archive as strict ByteStrings.
-- (The equation head and the final result line are not visible in this view;
-- presumably @fp@ is the archive path and @bs@ is returned -- confirm.)
171 openZip :: FilePath -> IO [DB.ByteString]
-- Collect the selectors (map keys) of all entries in the archive.
173 entries <- withArchive fp (DM.keys <$> getEntries)
-- Fetch every entry concurrently; 'withArchive' is invoked once per entry.
174 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
-- | Apply the per-character substitution @clean'@ to every character of the
-- text. (@clean'@ is defined on lines not visible in this view.)
177 clean :: Text -> Text
178 clean txt = DT.map clean' txt
-- Parse @f.ris@ as 'RisPresse' and write the resulting documents to @f.csv@,
-- where @f@ is the path given without extension.
risPress2csv f = do
  docs <- parseDocs RisPresse (f <> ".ris")
  writeDocs2Csv (f <> ".csv") docs