]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
Merge branch 'dev-np' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantext...
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
26 where
27
28 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import qualified Data.ByteString.Char8 as DBC
32 import Data.Attoparsec.ByteString (parseOnly, Parser)
33 import Data.Either(Either(..))
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat)
36 import Data.List (lookup)
37 import Data.Ord()
38 import Data.String (String())
39 import Data.String()
40 import Data.Text (Text)
41 import Data.Text.Encoding (decodeUtf8)
42 import Data.Time (UTCTime(..))
43 import Data.Tuple.Extra (both, first, second)
44 import System.FilePath (FilePath(), takeExtension)
45 import qualified Data.ByteString as DB
46 import qualified Data.Map as DM
47 import qualified Data.Text as DT
48 import qualified Data.Time as DT
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Prelude
51 import Gargantext.Database.Types.Node (HyperdataDocument(..))
52 import qualified Gargantext.Text.Parsers.WOS as WOS
53 import qualified Gargantext.Text.Parsers.RIS as RIS
54 import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
55 import Gargantext.Text.Parsers.Date (parseDate)
56 import Gargantext.Text.Parsers.CSV (parseHal)
57 import Gargantext.Text.Terms.Stop (detectLang)
58 ------------------------------------------------------------------------
59
60 type ParseError = String
61 --type Field = Text
62 --type Document = DM.Map Field Text
63 --type FilesParsed = DM.Map FilePath FileParsed
64 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
65 -- , _fileParsed_result :: [Document]
66 -- } deriving (Show)
67
68
69 -- | According to the format of Input file,
70 -- different parser are available.
71 data FileFormat = WOS | RIS | RisPresse
72 | CsvGargV3 | CsvHalFormat
73 deriving (Show)
74
75 -- Implemented (ISI Format)
76 -- | DOC -- Not Implemented / import Pandoc
77 -- | ODT -- Not Implemented / import Pandoc
78 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
79 -- | XML -- Not Implemented / see :
80 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
81
82 -- TODO: to debug maybe add the filepath in error message
83
84
85 -- | Parse file into documents
86 -- TODO manage errors here
87 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
88 parseDocs CsvHalFormat p = parseHal p
89 parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$> parse' RIS p
90 parseDocs WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith (map (first WOS.keys)) <$> parse' WOS p
91 parseDocs ff p = join $ mapM (toDoc ff) <$> snd <$> parse ff p
92
93 type Year = Int
94 type Month = Int
95 type Day = Int
96
97 -- | Parse date to Ints
98 -- TODO add hours, minutes and seconds
99 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
100 parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
101 parseDate' l (Just txt) = do
102 utcTime <- parseDate l txt
103 let (UTCTime day _) = utcTime
104 let (y,m,d) = DT.toGregorian day
105 pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
106
107
108 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
109 -- TODO use language for RIS
110 toDoc ff d = do
111 let abstract = lookup "abstract" d
112 let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
113
114 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
115
116 (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
117
118 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
119 (lookup "doi" d)
120 (lookup "URL" d)
121 Nothing
122 Nothing
123 Nothing
124 (lookup "title" d)
125 Nothing
126 (lookup "authors" d)
127 (lookup "source" d)
128 (lookup "abstract" d)
129 (fmap (DT.pack . show) utcTime)
130 (pub_year)
131 (pub_month)
132 (pub_day)
133 Nothing
134 Nothing
135 Nothing
136 (Just $ (DT.pack . show) lang)
137
138 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
139 parse ff fp = enrichWith identity <$> parse' ff fp
140
141 enrichWith ::
142 ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
143 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
144 enrichWith f = second (map both' . map f . concat)
145 where
146 both' = map (both decodeUtf8)
147
148 parse' :: FileFormat -> FilePath
149 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
150 parse' format path = do
151 files <- case takeExtension path of
152 ".zip" -> openZip path
153 _ -> pure <$> clean <$> DB.readFile path
154 partitionEithers <$> mapConcurrently (runParser format) files
155
156
157
158 -- | withParser:
159 -- According to the format of the text, choose the right parser.
160 -- TODO withParser :: FileFormat -> Parser [Document]
161 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
162 withParser WOS = WOS.parser
163 withParser RIS = RIS.parser
164 --withParser ODT = odtParser
165 --withParser XML = xmlParser
166 withParser _ = panic "[ERROR] Parser not implemented yet"
167
168 runParser :: FileFormat -> DB.ByteString
169 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
170 runParser format text = pure $ parseOnly (withParser format) text
171
172 openZip :: FilePath -> IO [DB.ByteString]
173 openZip fp = do
174 entries <- withArchive fp (DM.keys <$> getEntries)
175 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
176 pure bs
177
178 clean :: DB.ByteString -> DB.ByteString
179 clean txt = DBC.map clean' txt
180 where
181 clean' '’' = '\''
182 clean' '\r' = ' '
183 clean' c = c
184
185