]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
26 where
27
28 import System.FilePath (FilePath(), takeExtension)
29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
30
31 import Control.Monad (join)
32 import qualified Data.Time as DT
33 import Data.Either.Extra (partitionEithers)
34 import Data.Time (UTCTime(..))
35 import Data.List (concat)
36 import qualified Data.Map as DM
37 import qualified Data.ByteString as DB
38 import Data.Ord()
39 import Data.String()
40 import Data.Either(Either(..))
41 import Data.Attoparsec.ByteString (parseOnly, Parser)
42
43 import Data.Text (Text)
44 import qualified Data.Text as DT
45
46 -- Activate Async for to parse in parallel
47 import Control.Concurrent.Async as CCA (mapConcurrently)
48
49 import Data.Text.Encoding (decodeUtf8)
50 import Data.String (String())
51 import Data.List (lookup)
52
53 ------------------------------------------------------------------------
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Prelude
56 import Gargantext.Database.Types.Node (HyperdataDocument(..))
57 import Gargantext.Text.Parsers.WOS (wosParser)
58 import Gargantext.Text.Parsers.Date (parseDate)
59 import Gargantext.Text.Parsers.CSV (parseHal)
60 import Gargantext.Text.Terms.Stop (detectLang)
61 ------------------------------------------------------------------------
62
63 type ParseError = String
64 --type Field = Text
65 --type Document = DM.Map Field Text
66 --type FilesParsed = DM.Map FilePath FileParsed
67 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
68 -- , _fileParsed_result :: [Document]
69 -- } deriving (Show)
70
71
72 -- | According to the format of Input file,
73 -- different parser are available.
74 data FileFormat = WOS | CsvHalFormat -- | CsvGargV3
75 deriving (Show)
76
77 -- Implemented (ISI Format)
78 -- | DOC -- Not Implemented / import Pandoc
79 -- | ODT -- Not Implemented / import Pandoc
80 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
81 -- | XML -- Not Implemented / see :
82 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
83
84 -- TODO: to debug maybe add the filepath in error message
85
86
87 -- | Parse file into documents
88 -- TODO manage errors here
89 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
90 parseDocs WOS path = join $ mapM (toDoc WOS) <$> snd <$> parse WOS path
91 parseDocs CsvHalFormat p = parseHal p
92
93 type Year = Int
94 type Month = Int
95 type Day = Int
96
97 -- | Parse date to Ints
98 -- TODO add hours, minutes and seconds
99 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
100 parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
101 parseDate' l (Just txt) = do
102 utcTime <- parseDate l txt
103 let (UTCTime day _) = utcTime
104 let (y,m,d) = DT.toGregorian day
105 pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
106
107
108 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
109 toDoc WOS d = do
110 let abstract = lookup "abstract" d
111 let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
112
113 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
114
115 (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
116
117 pure $ HyperdataDocument (Just $ DT.pack $ show WOS)
118 (lookup "doi" d)
119 (lookup "URL" d)
120 Nothing
121 Nothing
122 Nothing
123 (lookup "title" d)
124 Nothing
125 (lookup "authors" d)
126 (lookup "source" d)
127 (lookup "abstract" d)
128 (fmap (DT.pack . show) utcTime)
129 (pub_year)
130 (pub_month)
131 (pub_day)
132 Nothing
133 Nothing
134 Nothing
135 (Just $ (DT.pack . show) lang)
136 toDoc _ _ = undefined
137
138 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
139 parse format path = do
140 files <- case takeExtension path of
141 ".zip" -> openZip path
142 _ -> pure <$> DB.readFile path
143 (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
144 pure (as, map toText $ concat bs)
145 where
146 -- TODO : decode with bayesian inference on encodings
147 toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
148
149
150 -- | withParser:
151 -- According to the format of the text, choose the right parser.
152 -- TODO withParser :: FileFormat -> Parser [Document]
153 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
154 withParser WOS = wosParser
155 --withParser DOC = docParser
156 --withParser ODT = odtParser
157 --withParser XML = xmlParser
158 withParser _ = panic "[ERROR] Parser not implemented yet"
159
160 runParser :: FileFormat -> DB.ByteString
161 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
162 runParser format text = pure $ parseOnly (withParser format) text
163
164 openZip :: FilePath -> IO [DB.ByteString]
165 openZip fp = do
166 entries <- withArchive fp (DM.keys <$> getEntries)
167 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
168 pure bs
169
170 clean :: Text -> Text
171 clean txt = DT.map clean' txt
172 where
173 clean' '’' = '\''
174 clean' c = c
175