src/Gargantext/Text/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22 {-# LANGUAGE PackageImports    #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
  26     where
  27
  28 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  29 import Control.Concurrent.Async as CCA (mapConcurrently)
  30 import Control.Monad (join)
  31 import qualified Data.ByteString.Char8 as DBC
  32 import Data.Attoparsec.ByteString (parseOnly, Parser)
  33 import Data.Either(Either(..))
  34 import Data.Either.Extra (partitionEithers)
  35 import Data.List (concat)
  36 import Data.List (lookup)
  37 import Data.Ord()
  38 import Data.String (String())
  39 import Data.String()
  40 import Data.Text (Text)
  41 import Data.Text.Encoding (decodeUtf8)
  42 import Data.Time (UTCTime(..))
  43 import Data.Tuple.Extra (both, first, second)
  44 import System.FilePath (FilePath(), takeExtension)
  45 import qualified Data.ByteString as DB
  46 import qualified Data.Map        as DM
  47 import qualified Data.Text as DT
  48 import qualified Data.Time as DT
  49 import Gargantext.Core (Lang(..))
  50 import Gargantext.Prelude
  51 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  52 import qualified Gargantext.Text.Parsers.WOS as WOS
  53 import qualified Gargantext.Text.Parsers.RIS as RIS
  54 import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
  55 import Gargantext.Text.Parsers.Date (parseDate)
  56 import Gargantext.Text.Parsers.CSV (parseHal)
  57 import Gargantext.Text.Terms.Stop (detectLang)
  58 ------------------------------------------------------------------------
  59
  60 type ParseError = String
  61 --type Field      = Text
  62 --type Document   = DM.Map Field Text
  63 --type FilesParsed = DM.Map FilePath FileParsed
  64 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  65 --                              , _fileParsed_result :: [Document]
  66 --                              } deriving (Show)
  67
  68
  69 -- | According to the format of Input file,
  70 -- different parser are available.
  71 data FileFormat = WOS | RIS | RisPresse
  72                 | CsvGargV3 | CsvHalFormat
  73   deriving (Show)
  74
  75 -- Implemented (ISI Format)
  76 --                | DOC        -- Not Implemented / import Pandoc
  77 --                | ODT        -- Not Implemented / import Pandoc
  78 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  79 --                | XML        -- Not Implemented / see :
  80 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  81
  82 -- TODO: to debug maybe add the filepath in error message
  83
  84
  85 -- | Parse file into documents
  86 -- TODO manage errors here
  87 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
  88 parseDocs CsvHalFormat p = parseHal p
  89 parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich           <$> parse' RIS p
  90 parseDocs WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith (map (first WOS.keys)) <$> parse' WOS p
  91 parseDocs ff        p = join $ mapM (toDoc ff)  <$> snd <$> parse ff p
  92
  93 type Year  = Int
  94 type Month = Int
  95 type Day   = Int
  96
  97 -- | Parse date to Ints
  98 -- TODO add hours, minutes and seconds
  99 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
 100 parseDate' _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
 101 parseDate' l (Just txt) = do
 102   utcTime <- parseDate l txt
 103   let (UTCTime day _) = utcTime
 104   let (y,m,d) = DT.toGregorian day
 105   pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
 106
 107
 108 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 109 -- TODO use language for RIS
 110 toDoc ff d = do
 111       let abstract = lookup "abstract" d
 112       let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
 113
 114       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 115
 116       (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang  dateToParse
 117
 118       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
 119                                (lookup "doi" d)
 120                                (lookup "URL" d)
 121                                 Nothing
 122                                 Nothing
 123                                 Nothing
 124                                (lookup "title" d)
 125                                 Nothing
 126                                (lookup "authors" d)
 127                                (lookup "source" d)
 128                                (lookup "abstract" d)
 129                                (fmap (DT.pack . show) utcTime)
 130                                (pub_year)
 131                                (pub_month)
 132                                (pub_day)
 133                                Nothing
 134                                Nothing
 135                                Nothing
 136                                (Just $ (DT.pack . show) lang)
 137
 138 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
 139 parse ff fp = enrichWith identity <$> parse' ff fp
 140
 141 enrichWith ::
 142   ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 143   ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 144 enrichWith f = second (map both' . map f . concat)
 145   where
 146     both'   = map (both decodeUtf8)
 147
 148 parse' :: FileFormat -> FilePath
 149        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 150 parse' format path = do
 151     files <- case takeExtension path of
 152               ".zip" -> openZip              path
 153               _      -> pure <$> clean <$> DB.readFile path
 154     partitionEithers <$> mapConcurrently (runParser format) files
 155
 156
 157
 158 -- | withParser:
 159 -- According to the format of the text, choose the right parser.
 160 -- TODO  withParser :: FileFormat -> Parser [Document]
 161 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 162 withParser WOS = WOS.parser
 163 withParser RIS = RIS.parser
 164 --withParser ODT = odtParser
 165 --withParser XML = xmlParser
 166 withParser _   = panic "[ERROR] Parser not implemented yet"
 167
 168 runParser :: FileFormat -> DB.ByteString
 169           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 170 runParser format text = pure $ parseOnly (withParser format) text
 171
 172 openZip :: FilePath -> IO [DB.ByteString]
 173 openZip fp = do
 174     entries <- withArchive fp (DM.keys <$> getEntries)
 175     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 176     pure bs
 177
 178 clean :: DB.ByteString -> DB.ByteString
 179 clean txt = DBC.map clean' txt
 180   where
 181     clean' '’' = '\''
 182     clean' '\r' = ' '
 183     clean' c  = c
 184
 185