src/Gargantext/Text/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describes how to add a new parser to Gargantext;
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22 {-# LANGUAGE PackageImports    #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs, risPress2csv)
  26     where
  27
  28 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  29 import Control.Concurrent.Async as CCA (mapConcurrently)
  30 import Control.Monad (join)
  31 import Data.Attoparsec.ByteString (parseOnly, Parser)
  32 import Data.Either(Either(..))
  33 import Data.Either.Extra (partitionEithers)
  34 import Data.List (concat)
  35 import Data.List (lookup)
  36 import Data.Ord()
  37 import Data.String (String())
  38 import Data.String()
  39 import Data.Text (Text)
  40 import Data.Text.Encoding (decodeUtf8)
  41 import Data.Time (UTCTime(..))
  42 import Data.Tuple.Extra (both, second)
  43 import System.FilePath (FilePath(), takeExtension)
  44 import qualified Data.ByteString as DB
  45 import qualified Data.Map        as DM
  46 import qualified Data.Text as DT
  47 import qualified Data.Time as DT
  48
  49 ------------------------------------------------------------------------
  50 import Gargantext.Core (Lang(..))
  51 import Gargantext.Prelude
  52 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  53 import Gargantext.Text.Parsers.WOS (wosParser)
  54 import Gargantext.Text.Parsers.RIS (risParser)
  55 import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
  56 import Gargantext.Text.Parsers.Date (parseDate)
  57 import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
  58 import Gargantext.Text.Terms.Stop (detectLang)
  59 ------------------------------------------------------------------------
  60
-- | Error message reported when a parser fails on one input; these are the
-- 'Left' values returned by 'runParser' and collected by @parse'@.
type ParseError = String
  62 --type Field      = Text
  63 --type Document   = DM.Map Field Text
  64 --type FilesParsed = DM.Map FilePath FileParsed
  65 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  66 --                              , _fileParsed_result :: [Document]
  67 --                              } deriving (Show)
  68
  69
  70 -- | According to the format of Input file,
  71 -- different parser are available.
  72 data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
  73   deriving (Show)
  74
  75 -- Implemented (ISI Format)
  76 --                | DOC        -- Not Implemented / import Pandoc
  77 --                | ODT        -- Not Implemented / import Pandoc
  78 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  79 --                | XML        -- Not Implemented / see :
  80 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  81
  82 -- TODO: to debug maybe add the filepath in error message
  83
  84
  85 -- | Parse file into documents
  86 -- TODO manage errors here
  87 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
  88 parseDocs CsvHalFormat p = parseHal p
  89 parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$>  parse' RIS p
  90 parseDocs ff    path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
  91
  92 type Year  = Int
  93 type Month = Int
  94 type Day   = Int
  95
  96 -- | Parse date to Ints
  97 -- TODO add hours, minutes and seconds
  98 parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
  99 parseDate' _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
 100 parseDate' l (Just txt) = do
 101   utcTime <- parseDate l txt
 102   let (UTCTime day _) = utcTime
 103   let (y,m,d) = DT.toGregorian day
 104   pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
 105
 106
 107 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 108 -- TODO use language for RIS
 109 toDoc ff d = do
 110       let abstract = lookup "abstract" d
 111       let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
 112
 113       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 114
 115       (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang  dateToParse
 116
 117       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
 118                                (lookup "doi" d)
 119                                (lookup "URL" d)
 120                                 Nothing
 121                                 Nothing
 122                                 Nothing
 123                                (lookup "title" d)
 124                                 Nothing
 125                                (lookup "authors" d)
 126                                (lookup "source" d)
 127                                (lookup "abstract" d)
 128                                (fmap (DT.pack . show) utcTime)
 129                                (pub_year)
 130                                (pub_month)
 131                                (pub_day)
 132                                Nothing
 133                                Nothing
 134                                Nothing
 135                                (Just $ (DT.pack . show) lang)
 136
 137 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
 138 parse ff fp = enrichWith identity <$> parse' ff fp
 139
 140 enrichWith ::
 141   ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 142   ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 143 enrichWith f = second (map both' . map f . concat)
 144   where
 145     both'   = map (both decodeUtf8)
 146
 147 parse' :: FileFormat -> FilePath
 148        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 149 parse' format path = do
 150     files <- case takeExtension path of
 151               ".zip" -> openZip              path
 152               _      -> pure <$> DB.readFile path
 153     partitionEithers <$> mapConcurrently (runParser format) files
 154
 155
 156
 157 -- | withParser:
 158 -- According to the format of the text, choose the right parser.
 159 -- TODO  withParser :: FileFormat -> Parser [Document]
 160 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 161 withParser WOS = wosParser
 162 withParser RIS = risParser
 163 --withParser ODT = odtParser
 164 --withParser XML = xmlParser
 165 withParser _   = panic "[ERROR] Parser not implemented yet"
 166
 167 runParser :: FileFormat -> DB.ByteString
 168           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 169 runParser format text = pure $ parseOnly (withParser format) text
 170
 171 openZip :: FilePath -> IO [DB.ByteString]
 172 openZip fp = do
 173     entries <- withArchive fp (DM.keys <$> getEntries)
 174     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 175     pure bs
 176
 177 clean :: Text -> Text
 178 clean txt = DT.map clean' txt
 179   where
 180     clean' '’' = '\''
 181     clean' c  = c
 182
 183
 184
 185 risPress2csv f = parseDocs RisPresse (f <> ".ris") >>= \hs -> writeDocs2Csv (f <> ".csv") hs
 186