]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
[DBFLOW] clean
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
26 where
27
28 import System.FilePath (FilePath(), takeExtension)
29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
30
31 import Control.Monad (join)
32 import Data.Time (UTCTime(..))
33 import qualified Data.Time as DT
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat)
36 import qualified Data.Map as DM
37 import qualified Data.ByteString as DB
38 import Data.Ord()
39 import Data.String()
40 import Data.Either(Either(..))
41 import Data.Attoparsec.ByteString (parseOnly, Parser)
42
43 import Data.Text (Text)
44 import qualified Data.Text as DT
45
46 -- Activate Async for to parse in parallel
47 import Control.Concurrent.Async as CCA (mapConcurrently)
48
49 import Data.Text.Encoding (decodeUtf8)
50 import Data.String (String())
51 import Data.List (lookup)
52
53 ------------------------------------------------------------------------
54 import Gargantext.Core (Lang(..))
55 import Gargantext.Prelude
56 import Gargantext.Database.Types.Node (HyperdataDocument(..))
57 import Gargantext.Text.Parsers.WOS (wosParser)
58 import Gargantext.Text.Parsers.Date (parseDate)
59 import Gargantext.Text.Terms.Stop (detectLang)
60 ------------------------------------------------------------------------
61
62 type ParseError = String
63 --type Field = Text
64 --type Document = DM.Map Field Text
65 --type FilesParsed = DM.Map FilePath FileParsed
66 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
67 -- , _fileParsed_result :: [Document]
68 -- } deriving (Show)
69
70
71 -- | According to the format of Input file,
72 -- different parser are available.
73 data FileFormat = WOS
74 deriving (Show)
75
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
81 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
82
83 -- TODO: to debug maybe add the filepath in error message
84
85
86 -- | Parse file into documents
87 -- TODO manage errors here
88 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
89 parseDocs format path = do
90 docs <- snd <$> parse format path
91 mapM (toDoc format) docs
92
93 type Year = Int
94 type Month = Int
95 type Day = Int
96
97 -- | Parse date to Ints
98 -- TODO add hours, minutes and seconds
99 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
100 parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
101 parseDate' l (Just txt) = do
102 utcTime <- parseDate l txt
103 let (UTCTime day _) = utcTime
104 let (y,m,d) = DT.toGregorian day
105 pure (Just utcTime, (Just (fromIntegral y),Just m,Just d))
106
107
108 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
109 toDoc format d = do
110
111 let abstract = lookup "abstract" d
112 let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
113
114 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
115
116 (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
117
118 pure $ HyperdataDocument (Just $ DT.pack $ show format)
119 (lookup "doi" d)
120 (lookup "URL" d)
121 Nothing
122 Nothing
123 (lookup "title" d)
124 (lookup "authors" d)
125 (lookup "source" d)
126 (lookup "abstract" d)
127 (fmap (DT.pack . show) utcTime)
128 (pub_year)
129 (pub_month)
130 (pub_day)
131 Nothing
132 Nothing
133 Nothing
134 (Just $ (DT.pack . show) lang)
135
136
137 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
138 parse format path = do
139 files <- case takeExtension path of
140 ".zip" -> openZip path
141 _ -> pure <$> DB.readFile path
142 (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
143 pure (as, map toText $ concat bs)
144 where
145 -- TODO : decode with bayesian inference on encodings
146 toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
147
148
149 -- | withParser:
150 -- According to the format of the text, choose the right parser.
151 -- TODO withParser :: FileFormat -> Parser [Document]
152 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
153 withParser WOS = wosParser
154 --withParser DOC = docParser
155 --withParser ODT = odtParser
156 --withParser XML = xmlParser
157 --withParser _ = error "[ERROR] Parser not implemented yet"
158
159 runParser :: FileFormat -> DB.ByteString
160 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
161 runParser format text = pure $ parseOnly (withParser format) text
162
163 openZip :: FilePath -> IO [DB.ByteString]
164 openZip fp = do
165 entries <- withArchive fp (DM.keys <$> getEntries)
166 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
167 pure bs
168
169 clean :: Text -> Text
170 clean txt = DT.map clean' txt
171 where
172 clean' '’' = '\''
173 clean' c = c
174
175