]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
[FEAT] Parsers RIS && Presse.
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs, risPress2csv)
26 where
27
28 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Data.Attoparsec.ByteString (parseOnly, Parser)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat)
35 import Data.List (lookup)
36 import Data.Ord()
37 import Data.String (String())
38 import Data.String()
39 import Data.Text (Text)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Time (UTCTime(..))
42 import Data.Tuple.Extra (both, second)
43 import System.FilePath (FilePath(), takeExtension)
44 import qualified Data.ByteString as DB
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Data.Time as DT
48
49 ------------------------------------------------------------------------
50 import Gargantext.Core (Lang(..))
51 import Gargantext.Prelude
52 import Gargantext.Database.Types.Node (HyperdataDocument(..))
53 import Gargantext.Text.Parsers.WOS (wosParser)
54 import Gargantext.Text.Parsers.RIS (risParser, presseParser)
55 import Gargantext.Text.Parsers.Date (parseDate)
56 import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
57 import Gargantext.Text.Terms.Stop (detectLang)
58 ------------------------------------------------------------------------
59
60 type ParseError = String
61 --type Field = Text
62 --type Document = DM.Map Field Text
63 --type FilesParsed = DM.Map FilePath FileParsed
64 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
65 -- , _fileParsed_result :: [Document]
66 -- } deriving (Show)
67
68
69 -- | According to the format of Input file,
70 -- different parser are available.
71 data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
72 deriving (Show)
73
74 -- Implemented (ISI Format)
75 -- | DOC -- Not Implemented / import Pandoc
76 -- | ODT -- Not Implemented / import Pandoc
77 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
78 -- | XML -- Not Implemented / see :
79 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
80
81 -- TODO: to debug maybe add the filepath in error message
82
83
84 -- | Parse file into documents
85 -- TODO manage errors here
86 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
87 parseDocs CsvHalFormat p = parseHal p
88 parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseParser <$> parse' RIS p
89 parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
90
91 type Year = Int
92 type Month = Int
93 type Day = Int
94
95 -- | Parse date to Ints
96 -- TODO add hours, minutes and seconds
97 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
98 parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
99 parseDate' l (Just txt) = do
100 utcTime <- parseDate l txt
101 let (UTCTime day _) = utcTime
102 let (y,m,d) = DT.toGregorian day
103 pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
104
105
106 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
107 -- TODO use language for RIS
108 toDoc ff d = do
109 let abstract = lookup "abstract" d
110 let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
111
112 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
113
114 (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
115
116 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
117 (lookup "doi" d)
118 (lookup "URL" d)
119 Nothing
120 Nothing
121 Nothing
122 (lookup "title" d)
123 Nothing
124 (lookup "authors" d)
125 (lookup "source" d)
126 (lookup "abstract" d)
127 (fmap (DT.pack . show) utcTime)
128 (pub_year)
129 (pub_month)
130 (pub_day)
131 Nothing
132 Nothing
133 Nothing
134 (Just $ (DT.pack . show) lang)
135 toDoc _ _ = undefined
136
137 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
138 parse ff fp = enrichWith identity <$> parse' ff fp
139
140 enrichWith ::
141 ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
142 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
143 enrichWith f = second (map both' . map f . concat)
144 where
145 both' = map (both decodeUtf8)
146
147 parse' :: FileFormat -> FilePath
148 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
149 parse' format path = do
150 files <- case takeExtension path of
151 ".zip" -> openZip path
152 _ -> pure <$> DB.readFile path
153 partitionEithers <$> mapConcurrently (runParser format) files
154
155
156
157 -- | withParser:
158 -- According to the format of the text, choose the right parser.
159 -- TODO withParser :: FileFormat -> Parser [Document]
160 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
161 withParser WOS = wosParser
162 withParser RIS = risParser
163 --withParser ODT = odtParser
164 --withParser XML = xmlParser
165 withParser _ = panic "[ERROR] Parser not implemented yet"
166
167 runParser :: FileFormat -> DB.ByteString
168 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
169 runParser format text = pure $ parseOnly (withParser format) text
170
171 openZip :: FilePath -> IO [DB.ByteString]
172 openZip fp = do
173 entries <- withArchive fp (DM.keys <$> getEntries)
174 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
175 pure bs
176
177 clean :: Text -> Text
178 clean txt = DT.map clean' txt
179 where
180 clean' '’' = '\''
181 clean' c = c
182
183
184
185 risPress2csv f = parseDocs RisPresse (f <> ".ris") >>= \hs -> writeDocs2Csv (f <> ".csv") hs
186
187