src/Gargantext/Text/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describes how to add a new parser to Gargantext;
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22 {-# LANGUAGE PackageImports    #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
  26     where
  27
  28 import System.FilePath (FilePath(), takeExtension)
  29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  30
  31 import Control.Monad (join)
  32 import qualified Data.Time as DT
  33 import Data.Either.Extra (partitionEithers)
  34 import Data.Time (UTCTime(..))
  35 import Data.List (concat)
  36 import qualified Data.Map        as DM
  37 import qualified Data.ByteString as DB
  38 import Data.Ord()
  39 import Data.String()
  40 import Data.Either(Either(..))
  41 import Data.Attoparsec.ByteString (parseOnly, Parser)
  42
  43 import Data.Text (Text)
  44 import qualified Data.Text as DT
  45
  46 -- Async is used to parse files in parallel
  47 import Control.Concurrent.Async as CCA (mapConcurrently)
  48
  49 import Data.Text.Encoding (decodeUtf8)
  50 import Data.String (String())
  51 import Data.List (lookup)
  52
  53 ------------------------------------------------------------------------
  54 import Gargantext.Core (Lang(..))
  55 import Gargantext.Prelude
  56 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  57 import Gargantext.Text.Parsers.WOS (wosParser)
  58 import Gargantext.Text.Parsers.Date (parseDate)
  59 import Gargantext.Text.Parsers.CSV (parseHal)
  60 import Gargantext.Text.Terms.Stop (detectLang)
  61 ------------------------------------------------------------------------
  62
  63 type ParseError = String
  64 --type Field      = Text
  65 --type Document   = DM.Map Field Text
  66 --type FilesParsed = DM.Map FilePath FileParsed
  67 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  68 --                              , _fileParsed_result :: [Document]
  69 --                              } deriving (Show)
  70
  71
  72 -- | According to the format of Input file,
  73 -- different parser are available.
  74 data FileFormat = WOS | CsvHalFormat -- | CsvGargV3
  75   deriving (Show)
  76
  77 -- Implemented (ISI Format)
  78 --                | DOC        -- Not Implemented / import Pandoc
  79 --                | ODT        -- Not Implemented / import Pandoc
  80 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  81 --                | XML        -- Not Implemented / see :
  82 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  83
  84 -- TODO: to ease debugging, maybe add the file path to the error message
  85
  86
  87 -- | Parse file into documents
  88 -- TODO manage errors here
  89 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
  90 parseDocs WOS    path = join $ mapM (toDoc WOS) <$> snd <$> parse WOS path
  91 parseDocs CsvHalFormat p = parseHal p
  92
  93 type Year  = Int
  94 type Month = Int
  95 type Day   = Int
  96
  97 -- | Parse date to Ints
  98 -- TODO add hours, minutes and seconds
  99 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
 100 parseDate' _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
 101 parseDate' l (Just txt) = do
 102   utcTime <- parseDate l txt
 103   let (UTCTime day _) = utcTime
 104   let (y,m,d) = DT.toGregorian day
 105   pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
 106
 107
 108 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 109 toDoc WOS d = do
 110       let abstract = lookup "abstract" d
 111       let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
 112
 113       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 114
 115       (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang  dateToParse
 116
 117       pure $ HyperdataDocument (Just $ DT.pack $ show WOS)
 118                                (lookup "doi" d)
 119                                (lookup "URL" d)
 120                                 Nothing
 121                                 Nothing
 122                                 Nothing
 123                                (lookup "title" d)
 124                                 Nothing
 125                                (lookup "authors" d)
 126                                (lookup "source" d)
 127                                (lookup "abstract" d)
 128                                (fmap (DT.pack . show) utcTime)
 129                                (pub_year)
 130                                (pub_month)
 131                                (pub_day)
 132                                Nothing
 133                                Nothing
 134                                Nothing
 135                                (Just $ (DT.pack . show) lang)
 136 toDoc _ _ = undefined
 137
 138 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
 139 parse format path = do
 140     files <- case takeExtension path of
 141               ".zip" -> openZip              path
 142               _      -> pure <$> DB.readFile path
 143     (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
 144     pure (as, map toText $ concat bs)
 145       where
 146         -- TODO : decode with bayesian inference on encodings
 147         toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
 148
 149
 150 -- | withParser:
 151 -- According to the format of the text, choose the right parser.
 152 -- TODO  withParser :: FileFormat -> Parser [Document]
 153 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 154 withParser WOS = wosParser
 155 --withParser DOC = docParser
 156 --withParser ODT = odtParser
 157 --withParser XML = xmlParser
 158 withParser _   = panic "[ERROR] Parser not implemented yet"
 159
 160 runParser :: FileFormat -> DB.ByteString
 161           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 162 runParser format text = pure $ parseOnly (withParser format) text
 163
 164 openZip :: FilePath -> IO [DB.ByteString]
 165 openZip fp = do
 166     entries <- withArchive fp (DM.keys <$> getEntries)
 167     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 168     pure bs
 169
 170 clean :: Text -> Text
 171 clean txt = DT.map clean' txt
 172   where
 173     clean' '’' = '\''
 174     clean' c  = c
 175