]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers.hs
[PHYLO] merge
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText)
26 where
27
28 --import Data.ByteString (ByteString)
29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
30 import Control.Concurrent.Async as CCA (mapConcurrently)
31 import Control.Monad (join)
32 import qualified Data.ByteString.Char8 as DBC
33 import Data.Attoparsec.ByteString (parseOnly, Parser)
34 import Data.Either(Either(..))
35 import Data.Either.Extra (partitionEithers)
36 import Data.List (concat)
37 import Data.List (lookup)
38 import Data.Ord()
39 import Data.String (String())
40 import Data.String()
41 import Data.Text (Text)
42 import Data.Text.Encoding (decodeUtf8)
43 import Data.Tuple.Extra (both, first, second)
44 import System.FilePath (FilePath(), takeExtension)
45 import qualified Data.ByteString as DB
46 import qualified Data.Map as DM
47 import qualified Data.Text as DT
48 import Gargantext.Core (Lang(..))
49 import Gargantext.Prelude
50 import Gargantext.Database.Types.Node (HyperdataDocument(..))
51 import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
52 import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
53 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
54 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
55 import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
56 import Gargantext.Text.Terms.Stop (detectLang)
57 ------------------------------------------------------------------------
58
59 type ParseError = String
60 --type Field = Text
61 --type Document = DM.Map Field Text
62 --type FilesParsed = DM.Map FilePath FileParsed
63 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
64 -- , _fileParsed_result :: [Document]
65 -- } deriving (Show)
66
67
68 -- | According to the format of Input file,
69 -- different parser are available.
70 data FileFormat = WOS | RIS | RisPresse
71 | CsvGargV3 | CsvHalFormat
72 deriving (Show)
73
74 -- Implemented (ISI Format)
75 -- | DOC -- Not Implemented / import Pandoc
76 -- | ODT -- Not Implemented / import Pandoc
77 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
78 -- | XML -- Not Implemented / see :
79
80
81 {-
82 parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
83 parseFormat = undefined
84 -}
85
86 -- | Parse file into documents
87 -- TODO manage errors here
88 -- TODO: to debug maybe add the filepath in error message
89 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
90 parseFile CsvHalFormat p = parseHal p
91 parseFile CsvGargV3 p = parseCsv p
92 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
93 parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
94 parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
95
96 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
97 -- TODO use language for RIS
98 toDoc ff d = do
99 let abstract = lookup "abstract" d
100 let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
101
102 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
103
104 (utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang dateToParse
105
106 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
107 (lookup "doi" d)
108 (lookup "URL" d)
109 Nothing
110 Nothing
111 Nothing
112 (lookup "title" d)
113 Nothing
114 (lookup "authors" d)
115 (lookup "source" d)
116 (lookup "abstract" d)
117 (fmap (DT.pack . show) utcTime)
118 (pub_year)
119 (pub_month)
120 (pub_day)
121 Nothing
122 Nothing
123 Nothing
124 (Just $ (DT.pack . show) lang)
125
126 enrichWith :: FileFormat
127 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
128 enrichWith RisPresse = enrichWith' presseEnrich
129 enrichWith WOS = enrichWith' (map (first WOS.keys))
130 enrichWith _ = enrichWith' identity
131
132
133 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
134 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
135 enrichWith' f = second (map both' . map f . concat)
136 where
137 both' = map (both decodeUtf8)
138
139 readFileWith :: FileFormat -> FilePath
140 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
141 readFileWith format path = do
142 files <- case takeExtension path of
143 ".zip" -> openZip path
144 _ -> pure <$> clean <$> DB.readFile path
145 partitionEithers <$> mapConcurrently (runParser format) files
146
147
148 -- | withParser:
149 -- According to the format of the text, choose the right parser.
150 -- TODO withParser :: FileFormat -> Parser [Document]
151 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
152 withParser WOS = WOS.parser
153 withParser RIS = RIS.parser
154 --withParser ODT = odtParser
155 --withParser XML = xmlParser
156 withParser _ = panic "[ERROR] Parser not implemented yet"
157
158 runParser :: FileFormat -> DB.ByteString
159 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
160 runParser format text = pure $ parseOnly (withParser format) text
161
162 openZip :: FilePath -> IO [DB.ByteString]
163 openZip fp = do
164 entries <- withArchive fp (DM.keys <$> getEntries)
165 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
166 pure bs
167
168 cleanText :: Text -> Text
169 cleanText = cs . clean . cs
170
171 clean :: DB.ByteString -> DB.ByteString
172 clean txt = DBC.map clean' txt
173 where
174 clean' '’' = '\''
175 clean' '\r' = ' '
176 clean' '\t' = ' '
177 clean' ';' = '.'
178 clean' c = c