src/Gargantext/Text/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22 {-# LANGUAGE PackageImports    #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
  26     where
  27
  28 import System.FilePath (FilePath(), takeExtension)
  29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  30
  31 import Control.Monad (join)
  32 import qualified Data.Time as DT
  33 import Data.Either.Extra (partitionEithers)
  34 import Data.Time (UTCTime(..))
  35 import Data.List (concat)
  36 import qualified Data.Map        as DM
  37 import qualified Data.ByteString as DB
  38 import Data.Ord()
  39 import Data.String()
  40 import Data.Either(Either(..))
  41 import Data.Attoparsec.ByteString (parseOnly, Parser)
  42
  43 import Data.Text (Text)
  44 import qualified Data.Text as DT
  45
  46 -- Activate Async for to parse in parallel
  47 import Control.Concurrent.Async as CCA (mapConcurrently)
  48
  49 import Data.Text.Encoding (decodeUtf8)
  50 import Data.String (String())
  51 import Data.List (lookup)
  52
  53 ------------------------------------------------------------------------
  54 import Gargantext.Core (Lang(..))
  55 import Gargantext.Prelude
  56 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  57 import Gargantext.Text.Parsers.WOS (wosParser)
  58 import Gargantext.Text.Parsers.RIS (risParser)
  59 import Gargantext.Text.Parsers.Date (parseDate)
  60 import Gargantext.Text.Parsers.CSV (parseHal)
  61 import Gargantext.Text.Terms.Stop (detectLang)
  62 ------------------------------------------------------------------------
  63
  64 type ParseError = String
  65 --type Field      = Text
  66 --type Document   = DM.Map Field Text
  67 --type FilesParsed = DM.Map FilePath FileParsed
  68 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  69 --                              , _fileParsed_result :: [Document]
  70 --                              } deriving (Show)
  71
  72
  73 -- | According to the format of Input file,
  74 -- different parser are available.
  75 data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
  76   deriving (Show)
  77
  78 -- Implemented (ISI Format)
  79 --                | DOC        -- Not Implemented / import Pandoc
  80 --                | ODT        -- Not Implemented / import Pandoc
  81 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  82 --                | XML        -- Not Implemented / see :
  83 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  84
  85 -- TODO: to debug maybe add the filepath in error message
  86
  87
  88 -- | Parse file into documents
  89 -- TODO manage errors here
  90 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
  91 parseDocs ff    path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
  92 parseDocs CsvHalFormat p = parseHal p
  93
  94 type Year  = Int
  95 type Month = Int
  96 type Day   = Int
  97
  98 -- | Parse date to Ints
  99 -- TODO add hours, minutes and seconds
 100 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
 101 parseDate' _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
 102 parseDate' l (Just txt) = do
 103   utcTime <- parseDate l txt
 104   let (UTCTime day _) = utcTime
 105   let (y,m,d) = DT.toGregorian day
 106   pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
 107
 108
 109 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 110 -- TODO use language for RIS
 111 toDoc ff d = do
 112       let abstract = lookup "abstract" d
 113       let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
 114
 115       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 116
 117       (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang  dateToParse
 118
 119       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
 120                                (lookup "doi" d)
 121                                (lookup "URL" d)
 122                                 Nothing
 123                                 Nothing
 124                                 Nothing
 125                                (lookup "title" d)
 126                                 Nothing
 127                                (lookup "authors" d)
 128                                (lookup "source" d)
 129                                (lookup "abstract" d)
 130                                (fmap (DT.pack . show) utcTime)
 131                                (pub_year)
 132                                (pub_month)
 133                                (pub_day)
 134                                Nothing
 135                                Nothing
 136                                Nothing
 137                                (Just $ (DT.pack . show) lang)
 138 toDoc _ _ = undefined
 139
 140 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
 141 parse format path = do
 142     files <- case takeExtension path of
 143               ".zip" -> openZip              path
 144               _      -> pure <$> DB.readFile path
 145     (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
 146     pure (as, map toText $ concat bs)
 147       where
 148         -- TODO : decode with bayesian inference on encodings
 149         toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
 150
 151
 152 -- | withParser:
 153 -- According to the format of the text, choose the right parser.
 154 -- TODO  withParser :: FileFormat -> Parser [Document]
 155 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 156 withParser WOS = wosParser
 157 withParser RIS = risParser
 158 --withParser ODT = odtParser
 159 --withParser XML = xmlParser
 160 withParser _   = panic "[ERROR] Parser not implemented yet"
 161
 162 runParser :: FileFormat -> DB.ByteString
 163           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 164 runParser format text = pure $ parseOnly (withParser format) text
 165
 166 openZip :: FilePath -> IO [DB.ByteString]
 167 openZip fp = do
 168     entries <- withArchive fp (DM.keys <$> getEntries)
 169     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 170     pure bs
 171
 172 clean :: Text -> Text
 173 clean txt = DT.map clean' txt
 174   where
 175     clean' '’' = '\''
 176     clean' c  = c
 177