]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
[PARSERS] RIS (WIP).
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
26 where
27
28 import System.FilePath (FilePath(), takeExtension)
29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
30
31 import Control.Monad (join)
32 import qualified Data.Time as DT
33 import Data.Either.Extra (partitionEithers)
34 import Data.Time (UTCTime(..))
35 import Data.List (concat)
36 import qualified Data.Map as DM
37 import qualified Data.ByteString as DB
38 import Data.Ord()
39 import Data.String()
40 import Data.Either(Either(..))
41 import Data.Attoparsec.ByteString (parseOnly, Parser)
42
43 import Data.Text (Text)
44 import qualified Data.Text as DT
45
46 -- Activate Async for to parse in parallel
47 import Control.Concurrent.Async as CCA (mapConcurrently)
48
49 import Data.Text.Encoding (decodeUtf8)
50 import Data.String (String())
51 import Data.List (lookup)
52
53 ------------------------------------------------------------------------
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Prelude
56 import Gargantext.Database.Types.Node (HyperdataDocument(..))
57 import Gargantext.Text.Parsers.WOS (wosParser)
58 import Gargantext.Text.Parsers.RIS (risParser)
59 import Gargantext.Text.Parsers.Date (parseDate)
60 import Gargantext.Text.Parsers.CSV (parseHal)
61 import Gargantext.Text.Terms.Stop (detectLang)
62 ------------------------------------------------------------------------
63
64 type ParseError = String
65 --type Field = Text
66 --type Document = DM.Map Field Text
67 --type FilesParsed = DM.Map FilePath FileParsed
68 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
69 -- , _fileParsed_result :: [Document]
70 -- } deriving (Show)
71
72
73 -- | According to the format of Input file,
74 -- different parser are available.
75 data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
76 deriving (Show)
77
78 -- Implemented (ISI Format)
79 -- | DOC -- Not Implemented / import Pandoc
80 -- | ODT -- Not Implemented / import Pandoc
81 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
82 -- | XML -- Not Implemented / see :
83 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
84
85 -- TODO: to debug maybe add the filepath in error message
86
87
88 -- | Parse file into documents
89 -- TODO manage errors here
90 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
91 parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
92 parseDocs CsvHalFormat p = parseHal p
93
94 type Year = Int
95 type Month = Int
96 type Day = Int
97
98 -- | Parse date to Ints
99 -- TODO add hours, minutes and seconds
100 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
101 parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
102 parseDate' l (Just txt) = do
103 utcTime <- parseDate l txt
104 let (UTCTime day _) = utcTime
105 let (y,m,d) = DT.toGregorian day
106 pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
107
108
109 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
110 -- TODO use language for RIS
111 toDoc ff d = do
112 let abstract = lookup "abstract" d
113 let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
114
115 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
116
117 (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
118
119 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
120 (lookup "doi" d)
121 (lookup "URL" d)
122 Nothing
123 Nothing
124 Nothing
125 (lookup "title" d)
126 Nothing
127 (lookup "authors" d)
128 (lookup "source" d)
129 (lookup "abstract" d)
130 (fmap (DT.pack . show) utcTime)
131 (pub_year)
132 (pub_month)
133 (pub_day)
134 Nothing
135 Nothing
136 Nothing
137 (Just $ (DT.pack . show) lang)
138 toDoc _ _ = undefined
139
140 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
141 parse format path = do
142 files <- case takeExtension path of
143 ".zip" -> openZip path
144 _ -> pure <$> DB.readFile path
145 (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
146 pure (as, map toText $ concat bs)
147 where
148 -- TODO : decode with bayesian inference on encodings
149 toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
150
151
152 -- | withParser:
153 -- According to the format of the text, choose the right parser.
154 -- TODO withParser :: FileFormat -> Parser [Document]
155 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
156 withParser WOS = wosParser
157 withParser RIS = risParser
158 --withParser ODT = odtParser
159 --withParser XML = xmlParser
160 withParser _ = panic "[ERROR] Parser not implemented yet"
161
162 runParser :: FileFormat -> DB.ByteString
163 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
164 runParser format text = pure $ parseOnly (withParser format) text
165
166 openZip :: FilePath -> IO [DB.ByteString]
167 openZip fp = do
168 entries <- withArchive fp (DM.keys <$> getEntries)
169 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
170 pure bs
171
172 clean :: Text -> Text
173 clean txt = DT.map clean' txt
174 where
175 clean' '’' = '\''
176 clean' c = c
177