src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
  24     where
  25
  26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  28 import Conduit
  29 import Control.Concurrent.Async as CCA (mapConcurrently)
  30 import Control.Monad (join)
  31 import Control.Monad.Trans.Control (MonadBaseControl)
  32 import Data.Attoparsec.ByteString (parseOnly, Parser)
  33 import Data.Either(Either(..))
  34 import Data.Either.Extra (partitionEithers)
  35 import Data.List (concat, lookup)
  36 import Data.Ord()
  37 import Data.String (String())
  38 import Data.String()
  39 import Data.Text (Text, intercalate, pack, unpack)
  40 import Data.Text.Encoding (decodeUtf8)
  41 import Data.Tuple.Extra (both, first, second)
  42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
  43 import Gargantext.Core (Lang(..))
  44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
  45 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  46 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  47 import Gargantext.Prelude
  48 import System.FilePath (FilePath(), takeExtension)
  49 import System.IO.Temp (emptySystemTempFile)
  50 import qualified Data.ByteString       as DB
  51 import qualified Data.ByteString.Char8 as DBC
  52 import qualified Data.ByteString.Lazy  as DBL
  53 import qualified Data.Map              as DM
  54 import qualified Data.Text             as DT
  55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
  56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
  57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
  58 import qualified Prelude
  59 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
  60 ------------------------------------------------------------------------
  61
  62 type ParseError = String
  63 --type Field      = Text
  64 --type Document   = DM.Map Field Text
  65 --type FilesParsed = DM.Map FilePath FileParsed
  66 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  67 --                              , _fileParsed_result :: [Document]
  68 --                              } deriving (Show)
  69
  70
  71 -- | According to the format of Input file,
  72 -- different parser are available.
  73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
  74   deriving (Show)
  75
  76 -- Implemented (ISI Format)
  77 --                | DOC        -- Not Implemented / import Pandoc
  78 --                | ODT        -- Not Implemented / import Pandoc
  79 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  80 --                | XML        -- Not Implemented / see :
  81
  82 parseFormatC :: MonadBaseControl IO m
  83              => FileType
  84              -> FileFormat
  85              -> DB.ByteString
  86              -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
  87 parseFormatC CsvGargV3 Plain bs = do
  88   let eParsedC = parseCsvC $ DBL.fromStrict bs
  89   case eParsedC of
  90     Left err -> pure $ Left err
  91     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
  92 parseFormatC CsvHal    Plain bs = do
  93   let eParsedC = parseCsvC $ DBL.fromStrict bs
  94   case eParsedC of
  95     Left err -> pure $ Left err
  96     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
  97 parseFormatC RisPresse Plain bs = do
  98   --docs <- enrichWith RisPresse
  99   let eDocs = runParser' RisPresse bs
 100   pure $ (\docs ->
 101             ( Just $ fromIntegral $ length docs
 102             , yieldMany docs
 103               .| mapC presseEnrich
 104               .| mapC (map $ both decodeUtf8)
 105               .| mapMC (toDoc RIS)) ) <$> eDocs
 106 parseFormatC WOS Plain bs = do
 107   let eDocs = runParser' WOS bs
 108   pure $ (\docs ->
 109             ( Just $ fromIntegral $ length docs
 110             , yieldMany docs
 111               .| mapC (map $ first WOS.keys)
 112               .| mapC (map $ both decodeUtf8)
 113               .| mapMC (toDoc WOS)) ) <$> eDocs
 114 parseFormatC ft ZIP bs = do
 115   path <- liftBase $ emptySystemTempFile "parsed-zip"
 116   liftBase $ DB.writeFile path bs
 117   fileContents <- liftBase $ withArchive path $ do
 118     files <- DM.keys <$> getEntries
 119     mapM getEntry files
 120   --printDebug "[parseFormatC] fileContents" fileContents
 121   eContents <- mapM (parseFormatC ft Plain) fileContents
 122   --printDebug "[parseFormatC] contents" contents
 123   --pure $ Left $ "Not implemented for ZIP"
 124   let (errs, contents) = partitionEithers eContents
 125   case errs of
 126     [] ->
 127       case contents of
 128         [] -> pure $ Left "No files in zip"
 129         _  -> do
 130           let lenghts = fst <$> contents
 131           let contents' = snd <$> contents
 132           let totalLength = sum $ sum <$> lenghts  -- Trick: sum (Just 1) = 1, sum Nothing = 0
 133           pure $ Right ( Just totalLength
 134                        , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
 135     _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
 136
 137 parseFormatC _ _ _ = undefined
 138
 139 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
 140 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
 141 -- parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
 142 -- parseFormat RisPresse bs = do
 143 --   docs <- mapM (toDoc RIS)
 144 --           <$> snd
 145 --           <$> enrichWith RisPresse
 146 --           $ partitionEithers
 147 --           $ [runParser'  RisPresse bs]
 148 --   pure $ Right docs
 149 -- parseFormat WOS bs = do
 150 --   docs <- mapM (toDoc WOS)
 151 --           <$> snd
 152 --           <$> enrichWith WOS
 153 --           $ partitionEithers
 154 --           $ [runParser'  WOS bs]
 155 --   pure $ Right docs
 156 -- parseFormat ZIP bs = do
 157 --   path <- emptySystemTempFile "parsed-zip"
 158 --   DB.writeFile path bs
 159 --   parsedZip <- withArchive path $ do
 160 --     DM.keys <$> getEntries
 161 --   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 162 -- parseFormat _ _ = undefined
 163
 164 -- | Parse file into documents
 165 -- TODO manage errors here
 166 -- TODO: to debug maybe add the filepath in error message
 167
 168 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 169 parseFile CsvHal    Plain p = parseHal p
 170 parseFile CsvGargV3 Plain p = parseCsv p
 171
 172 parseFile RisPresse Plain p = do
 173   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 174   pure $ Right docs
 175
 176 parseFile WOS       Plain p = do
 177   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 178   pure $ Right docs
 179
 180 parseFile ff        _ p = do
 181   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 182   pure $ Right docs
 183
 184 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
 185 -- TODO use language for RIS
 186 toDoc ff d = do
 187       -- let abstract = lookup "abstract" d
 188       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 189
 190       let dateToParse = DT.replace " " "" <$> lookup "PY" d  -- <> Just " " <> lookup "publication_date" d
 191       printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
 192       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
 193
 194       let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 195                              , _hd_doi = lookup "doi" d
 196                              , _hd_url = lookup "URL" d
 197                              , _hd_uniqId = Nothing
 198                              , _hd_uniqIdBdd = Nothing
 199                              , _hd_page = Nothing
 200                              , _hd_title = lookup "title" d
 201                              , _hd_authors = lookup "authors" d
 202                              , _hd_institutes = lookup "institutes" d
 203                              , _hd_source = lookup "source" d
 204                              , _hd_abstract = lookup "abstract" d
 205                              , _hd_publication_date = fmap (DT.pack . show) utcTime
 206                              , _hd_publication_year = pub_year
 207                              , _hd_publication_month = pub_month
 208                              , _hd_publication_day = pub_day
 209                              , _hd_publication_hour = Nothing
 210                              , _hd_publication_minute = Nothing
 211                              , _hd_publication_second = Nothing
 212                              , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 213       printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
 214       pure hd
 215
 216 enrichWith :: FileType
 217            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 218 enrichWith RisPresse = enrichWith' presseEnrich
 219 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 220 enrichWith _         = enrichWith' identity
 221
 222
 223 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 224            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 225 enrichWith' f = second (map both' . map f . concat)
 226   where
 227     both'   = map (both decodeUtf8)
 228
 229
 230
 231 readFileWith :: FileType -> FilePath
 232        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 233 readFileWith format path = do
 234     files <- case takeExtension path of
 235               ".zip" -> openZip              path
 236               _      -> pure <$> clean <$> DB.readFile path
 237     partitionEithers <$> mapConcurrently (runParser format) files
 238
 239
 240 -- | withParser:
 241 -- According to the format of the text, choose the right parser.
 242 -- TODO  withParser :: FileType -> Parser [Document]
 243 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
 244 withParser WOS = WOS.parser
 245 withParser RIS = RIS.parser
 246 --withParser ODT = odtParser
 247 --withParser XML = xmlParser
 248 withParser _   = panic "[ERROR] Parser not implemented yet"
 249
 250 runParser :: FileType -> DB.ByteString
 251           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 252 runParser format text = pure $ runParser' format text
 253
 254 runParser' :: FileType -> DB.ByteString
 255           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 256 runParser' format text = parseOnly (withParser format) text
 257
 258 openZip :: FilePath -> IO [DB.ByteString]
 259 openZip fp = do
 260     entries <- withArchive fp (DM.keys <$> getEntries)
 261     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 262     pure bs
 263
 264 cleanText :: Text -> Text
 265 cleanText = cs . clean . cs
 266
 267 clean :: DB.ByteString -> DB.ByteString
 268 clean txt = DBC.map clean' txt
 269   where
 270     clean' '’' = '\''
 271     clean' '\r' = ' '
 272     clean' '\t' = ' '
 273     clean' ';' = '.'
 274     clean' c  = c
 275
 276 --
 277
 278 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
 279 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
 280 splitOn _ _                  = (DT.splitOn ", ")
 281