{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext analyzes semi-structured text, which must first be parsed.

The parsers assume the format of the text is known (the 'FileType' data
type); the right parser is then chosen accordingly from the list of
available parsers.

This module mainly describes how to add a new parser to Gargantext:
please follow the types.
-}
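
-- A rough sketch of what adding a new format involves, inferred from the
-- types below (not an official checklist); @MyFormat@ and @MyFormat.parser@
-- are hypothetical names:
--
-- > data FileType = ... | MyFormat
-- > withParser   MyFormat = MyFormat.parser       -- an attoparsec 'Parser'
-- > enrichWith   MyFormat = enrichWith' identity  -- or a key-mapping function
-- > parseFormatC MyFormat Plain bs = ...          -- streaming (conduit) variant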

{-# LANGUAGE PackageImports #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn, etale)
    where

-- import Gargantext.Core.Text.Learn (detectLangDefault)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
import Gargantext.Prelude
import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Text as Text
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
import qualified Prelude
------------------------------------------------------------------------

type ParseError = String
--type Field = Text
--type Document = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
--                             , _fileParsed_result :: [Document]
--                             } deriving (Show)


-- | According to the format of the input file,
-- different parsers are available.
data FileType = WOS        -- Implemented (ISI format)
              | RIS
              | RisPresse
              | CsvGargV3
              | CsvHal
              | Iramuteq
              | JSON
  deriving (Show, Eq)

-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :

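-- A usage sketch in IO ('bs' is assumed to hold the raw bytes of a WOS
-- export; 'runConduit' and 'sinkList' come from the Conduit import):
--
-- > eRes <- parseFormatC WOS Plain bs
-- > case eRes of
-- >   Left err             -> putStrLn err
-- >   Right (_mLen, docsC) -> do
-- >     docs <- runConduit $ docsC .| sinkList
-- >     print (length docs)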
parseFormatC :: MonadBaseControl IO m
             => FileType
             -> FileFormat
             -> DB.ByteString
             -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 Plain bs = do
  let eParsedC = parseCsvC $ DBL.fromStrict bs
  case eParsedC of
    Left err -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC CsvHal Plain bs = do
  let eParsedC = parseCsvC $ DBL.fromStrict bs
  case eParsedC of
    Left err -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC RisPresse Plain bs = do
  --docs <- enrichWith RisPresse
  let eDocs = runParser' RisPresse bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC presseEnrich
              .| mapC (map $ both decodeUtf8)
              .| mapMC (toDoc RIS)) ) <$> eDocs
parseFormatC WOS Plain bs = do
  let eDocs = runParser' WOS bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC (map $ first WOS.keys)
              .| mapC (map $ both decodeUtf8)
              .| mapMC (toDoc WOS)) ) <$> eDocs

parseFormatC Iramuteq Plain bs = do
  let eDocs = runParser' Iramuteq bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC (map $ first Iramuteq.keys)
              .| mapC (map $ both decodeUtf8)
              .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
            )
         )
         <$> eDocs

parseFormatC JSON Plain bs = do
  let eParsedC = parseJSONC $ DBL.fromStrict bs
  case eParsedC of
    Left err -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)

parseFormatC ft ZIP bs = do
  path <- liftBase $ emptySystemTempFile "parsed-zip"
  liftBase $ DB.writeFile path bs
  fileContents <- liftBase $ withArchive path $ do
    files <- DM.keys <$> getEntries
    mapM getEntry files
  --printDebug "[parseFormatC] fileContents" fileContents
  eContents <- mapM (parseFormatC ft Plain) fileContents
  --printDebug "[parseFormatC] contents" contents
  --pure $ Left $ "Not implemented for ZIP"
  let (errs, contents) = partitionEithers eContents
  case errs of
    [] ->
      case contents of
        [] -> pure $ Left "No files in zip"
        _  -> do
          let lengths = fst <$> contents
          let contents' = snd <$> contents
          let totalLength = sum $ sum <$> lengths -- Trick: sum (Just 1) = 1, sum Nothing = 0
          pure $ Right ( Just totalLength
                       , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
    _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs

parseFormatC _ _ _ = undefined


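-- | Spread ("étale") each document into several ones: one document per
-- chunk of its abstract, as produced by 'text2titleParagraphs' with a
-- chunk size of 7; each new document carries its chunk as the abstract.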
etale :: [HyperdataDocument] -> [HyperdataDocument]
etale = concat . (map etale')
  where
    etale' :: HyperdataDocument -> [HyperdataDocument]
    etale' h = map (\t -> h { _hd_abstract = Just t })
             $ map snd
             $ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)


-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-- parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
--   docs <- mapM (toDoc RIS)
--           <$> snd
--           <$> enrichWith RisPresse
--           $ partitionEithers
--           $ [runParser' RisPresse bs]
--   pure $ Right docs
-- parseFormat WOS bs = do
--   docs <- mapM (toDoc WOS)
--           <$> snd
--           <$> enrichWith WOS
--           $ partitionEithers
--           $ [runParser' WOS bs]
--   pure $ Right docs
-- parseFormat ZIP bs = do
--   path <- emptySystemTempFile "parsed-zip"
--   DB.writeFile path bs
--   parsedZip <- withArchive path $ do
--     DM.keys <$> getEntries
--   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined

-- | Parse a file into documents
-- TODO manage errors here
-- TODO: to ease debugging, maybe add the filepath to the error message
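--
-- A usage sketch (the path "corpus.isi" is hypothetical; any Web of
-- Science export would do for the 'WOS' case):
--
-- > eDocs <- parseFile WOS Plain "corpus.isi"
-- > case eDocs of
-- >   Right docs -> print (length docs)
-- >   Left  err  -> putStrLn err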

parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p

parseFile RisPresse Plain p = do
  docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs

parseFile WOS Plain p = do
  docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs

parseFile Iramuteq Plain p = do
  docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
               <$> snd
               <$> enrichWith Iramuteq
               <$> readFileWith Iramuteq p
  pure $ Right docs


parseFile ff _ p = do
  docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs

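-- An input sketch for 'toDoc' (hand-written values; the keys follow the
-- 'lookup's below, e.g. "title", "abstract", "authors", "PY"):
--
-- > toDoc WOS [ ("title", "A title")
-- >           , ("abstract", "An abstract")
-- >           , ("PY", "2017") ]
--
-- which yields a 'HyperdataDocument' whose publication date is parsed from "PY".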
toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

  let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
  -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
  (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse

  let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                             , _hd_doi = lookup "doi" d
                             , _hd_url = lookup "URL" d
                             , _hd_uniqId = Nothing
                             , _hd_uniqIdBdd = Nothing
                             , _hd_page = Nothing
                             , _hd_title = lookup "title" d
                             , _hd_authors = lookup "authors" d
                             , _hd_institutes = lookup "institutes" d
                             , _hd_source = lookup "source" d
                             , _hd_abstract = lookup "abstract" d
                             , _hd_publication_date = fmap (DT.pack . show) utcTime
                             , _hd_publication_year = pub_year
                             , _hd_publication_month = pub_month
                             , _hd_publication_day = pub_day
                             , _hd_publication_hour = Nothing
                             , _hd_publication_minute = Nothing
                             , _hd_publication_second = Nothing
                             , _hd_language_iso2 = Just $ (DT.pack . show) lang }
  -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
  pure hd

enrichWith :: FileType
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith Iramuteq  = enrichWith' (map (first Iramuteq.keys))
enrichWith _         = enrichWith' identity


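-- | Shared helper for 'enrichWith': flatten the nested parser output,
-- apply the given field transformation, then decode every key/value pair
-- from UTF-8 'DB.ByteString' to 'Text'.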
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)



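-- | Read a corpus file from disk: a ".zip" is unpacked entry by entry,
-- any other file is read and 'clean'ed as a single 'DB.ByteString'; each
-- chunk is then parsed concurrently and parse errors are separated from
-- successful results.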
readFileWith :: FileType -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) files


-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileType -> Parser [Document]
withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS      = WOS.parser
withParser RIS      = RIS.parser
withParser Iramuteq = Iramuteq.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _        = panic "[ERROR] Parser not implemented yet"

runParser :: FileType -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileType -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entries <- withArchive fp (DM.keys <$> getEntries)
  bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
  pure bs

cleanText :: Text -> Text
cleanText = cs . clean . cs

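-- For instance (ByteString literals shown via OverloadedStrings; the
-- replacements follow the character table below):
--
-- > clean "a;b\tc\r" == "a.b c "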
clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
  where
    clean' '’'  = '\''
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c

--

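-- For instance (the 'Just "WOS"' tag selects the semicolon separator used
-- by Web of Science author lists):
--
-- > splitOn Authors (Just "WOS") "Doe, J.; Smith, A." == ["Doe, J.", "Smith, A."]
-- > splitOn Authors Nothing      "a, b, c"            == ["a", "b", "c"]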
splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn Authors (Just "WOS") = (DT.splitOn "; ")
splitOn _ _ = (DT.splitOn ", ")