src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn, etale)
  24     where
  25
  26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  28 import Conduit
  29 import Control.Concurrent.Async as CCA (mapConcurrently)
  30 import Control.Monad (join)
  31 import Control.Monad.Trans.Control (MonadBaseControl)
  32 import Data.Attoparsec.ByteString (parseOnly, Parser)
  33 import Data.Either(Either(..))
  34 import Data.Either.Extra (partitionEithers)
  35 import Data.List (concat, lookup)
  36 import Data.Ord()
  37 import Data.String (String())
  38 import Data.String()
  39 import Data.Text (Text, intercalate, pack, unpack)
  40 import Data.Text.Encoding (decodeUtf8)
  41 import Data.Tuple.Extra (both, first, second)
  42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
  43 import Gargantext.Core (Lang(..))
  44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
  45 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  46 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  47 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
  48 import Gargantext.Prelude
  49 import System.FilePath (FilePath(), takeExtension)
  50 import System.IO.Temp (emptySystemTempFile)
  51 import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
  52 import qualified Data.ByteString       as DB
  53 import qualified Data.ByteString.Char8 as DBC
  54 import qualified Data.ByteString.Lazy  as DBL
  55 import qualified Data.Map              as DM
  56 import qualified Data.Text             as DT
  57 import qualified Data.Text as Text
  58 import qualified Gargantext.Core.Text.Corpus.Parsers.Date      as Date
  59 import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq  as Iramuteq
  60 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS       as RIS
  61 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS       as WOS
  62 import qualified Prelude
  63 ------------------------------------------------------------------------
  64
  65 type ParseError = String
  66 --type Field      = Text
  67 --type Document   = DM.Map Field Text
  68 --type FilesParsed = DM.Map FilePath FileParsed
  69 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  70 --                              , _fileParsed_result :: [Document]
  71 --                              } deriving (Show)
  72
  73
  74 -- | According to the format of Input file,
  75 -- different parser are available.
  76 data FileType = WOS
  77               | RIS
  78               | RisPresse
  79               | CsvGargV3
  80               | CsvHal
  81               | Iramuteq
  82   deriving (Show)
  83
  84 -- Implemented (ISI Format)
  85 --                | DOC        -- Not Implemented / import Pandoc
  86 --                | ODT        -- Not Implemented / import Pandoc
  87 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  88 --                | XML        -- Not Implemented / see :
  89
  90 parseFormatC :: MonadBaseControl IO m
  91              => FileType
  92              -> FileFormat
  93              -> DB.ByteString
  94              -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
  95 parseFormatC CsvGargV3 Plain bs = do
  96   let eParsedC = parseCsvC $ DBL.fromStrict bs
  97   case eParsedC of
  98     Left err -> pure $ Left err
  99     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
 100 parseFormatC CsvHal    Plain bs = do
 101   let eParsedC = parseCsvC $ DBL.fromStrict bs
 102   case eParsedC of
 103     Left err -> pure $ Left err
 104     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
 105 parseFormatC RisPresse Plain bs = do
 106   --docs <- enrichWith RisPresse
 107   let eDocs = runParser' RisPresse bs
 108   pure $ (\docs ->
 109             ( Just $ fromIntegral $ length docs
 110             , yieldMany docs
 111               .| mapC presseEnrich
 112               .| mapC (map $ both decodeUtf8)
 113               .| mapMC (toDoc RIS)) ) <$> eDocs
 114 parseFormatC WOS Plain bs = do
 115   let eDocs = runParser' WOS bs
 116   pure $ (\docs ->
 117             ( Just $ fromIntegral $ length docs
 118             , yieldMany docs
 119               .| mapC (map $ first WOS.keys)
 120               .| mapC (map $ both decodeUtf8)
 121               .| mapMC (toDoc WOS)) ) <$> eDocs
 122
 123 parseFormatC Iramuteq Plain bs = do
 124   let eDocs = runParser' Iramuteq bs
 125   pure $ (\docs ->
 126             ( Just $ fromIntegral $ length docs
 127             , yieldMany docs
 128               .| mapC (map $ first Iramuteq.keys)
 129               .| mapC (map $ both decodeUtf8)
 130               .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
 131             )
 132          )
 133               <$> eDocs
 134
 135 parseFormatC ft ZIP bs = do
 136   path <- liftBase $ emptySystemTempFile "parsed-zip"
 137   liftBase $ DB.writeFile path bs
 138   fileContents <- liftBase $ withArchive path $ do
 139     files <- DM.keys <$> getEntries
 140     mapM getEntry files
 141   --printDebug "[parseFormatC] fileContents" fileContents
 142   eContents <- mapM (parseFormatC ft Plain) fileContents
 143   --printDebug "[parseFormatC] contents" contents
 144   --pure $ Left $ "Not implemented for ZIP"
 145   let (errs, contents) = partitionEithers eContents
 146   case errs of
 147     [] ->
 148       case contents of
 149         [] -> pure $ Left "No files in zip"
 150         _  -> do
 151           let lenghts = fst <$> contents
 152           let contents' = snd <$> contents
 153           let totalLength = sum $ sum <$> lenghts  -- Trick: sum (Just 1) = 1, sum Nothing = 0
 154           pure $ Right ( Just totalLength
 155                        , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
 156     _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
 157
 158 parseFormatC _ _ _ = undefined
 159
 160
 161 etale :: [HyperdataDocument] -> [HyperdataDocument]
 162 etale = concat . (map etale')
 163   where
 164     etale' :: HyperdataDocument -> [HyperdataDocument]
 165     etale' h = map (\t -> h { _hd_abstract = Just t })
 166             $ map snd
 167             $ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)
 168
 169
 170 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
 171 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
 172 -- parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
 173 -- parseFormat RisPresse bs = do
 174 --   docs <- mapM (toDoc RIS)
 175 --           <$> snd
 176 --           <$> enrichWith RisPresse
 177 --           $ partitionEithers
 178 --           $ [runParser'  RisPresse bs]
 179 --   pure $ Right docs
 180 -- parseFormat WOS bs = do
 181 --   docs <- mapM (toDoc WOS)
 182 --           <$> snd
 183 --           <$> enrichWith WOS
 184 --           $ partitionEithers
 185 --           $ [runParser'  WOS bs]
 186 --   pure $ Right docs
 187 -- parseFormat ZIP bs = do
 188 --   path <- emptySystemTempFile "parsed-zip"
 189 --   DB.writeFile path bs
 190 --   parsedZip <- withArchive path $ do
 191 --     DM.keys <$> getEntries
 192 --   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 193 -- parseFormat _ _ = undefined
 194
 195 -- | Parse file into documents
 196 -- TODO manage errors here
 197 -- TODO: to debug maybe add the filepath in error message
 198
 199 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 200 parseFile CsvHal    Plain p = parseHal p
 201 parseFile CsvGargV3 Plain p = parseCsv p
 202
 203 parseFile RisPresse Plain p = do
 204   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 205   pure $ Right docs
 206
 207 parseFile WOS       Plain p = do
 208   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 209   pure $ Right docs
 210
 211 parseFile Iramuteq       Plain p = do
 212   docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
 213               <$> snd
 214               <$> enrichWith Iramuteq
 215               <$> readFileWith Iramuteq p
 216   pure $ Right docs
 217
 218
 219 parseFile ff        _ p = do
 220   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 221   pure $ Right docs
 222
 223 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
 224 -- TODO use language for RIS
 225 toDoc ff d = do
 226       -- let abstract = lookup "abstract" d
 227       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 228
 229       let dateToParse = DT.replace " " "" <$> lookup "PY" d  -- <> Just " " <> lookup "publication_date" d
 230       -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
 231       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
 232
 233       let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 234                              , _hd_doi = lookup "doi" d
 235                              , _hd_url = lookup "URL" d
 236                              , _hd_uniqId = Nothing
 237                              , _hd_uniqIdBdd = Nothing
 238                              , _hd_page = Nothing
 239                              , _hd_title = lookup "title" d
 240                              , _hd_authors = lookup "authors" d
 241                              , _hd_institutes = lookup "institutes" d
 242                              , _hd_source = lookup "source" d
 243                              , _hd_abstract = lookup "abstract" d
 244                              , _hd_publication_date = fmap (DT.pack . show) utcTime
 245                              , _hd_publication_year = pub_year
 246                              , _hd_publication_month = pub_month
 247                              , _hd_publication_day = pub_day
 248                              , _hd_publication_hour = Nothing
 249                              , _hd_publication_minute = Nothing
 250                              , _hd_publication_second = Nothing
 251                              , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 252       -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
 253       pure hd
 254
 255 enrichWith :: FileType
 256            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 257 enrichWith RisPresse = enrichWith' presseEnrich
 258 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 259 enrichWith Iramuteq  = enrichWith' (map (first Iramuteq.keys))
 260 enrichWith _         = enrichWith' identity
 261
 262
 263 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 264            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 265 enrichWith' f = second (map both' . map f . concat)
 266   where
 267     both'   = map (both decodeUtf8)
 268
 269
 270
 271 readFileWith :: FileType -> FilePath
 272        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 273 readFileWith format path = do
 274     files <- case takeExtension path of
 275               ".zip" -> openZip              path
 276               _      -> pure <$> clean <$> DB.readFile path
 277     partitionEithers <$> mapConcurrently (runParser format) files
 278
 279
 280 -- | withParser:
 281 -- According to the format of the text, choose the right parser.
 282 -- TODO  withParser :: FileType -> Parser [Document]
 283 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
 284 withParser WOS      = WOS.parser
 285 withParser RIS      = RIS.parser
 286 withParser Iramuteq = Iramuteq.parser
 287 --withParser ODT = odtParser
 288 --withParser XML = xmlParser
 289 withParser _   = panic "[ERROR] Parser not implemented yet"
 290
 291 runParser :: FileType -> DB.ByteString
 292           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 293 runParser format text = pure $ runParser' format text
 294
 295 runParser' :: FileType -> DB.ByteString
 296           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 297 runParser' format text = parseOnly (withParser format) text
 298
 299 openZip :: FilePath -> IO [DB.ByteString]
 300 openZip fp = do
 301     entries <- withArchive fp (DM.keys <$> getEntries)
 302     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 303     pure bs
 304
 305 cleanText :: Text -> Text
 306 cleanText = cs . clean . cs
 307
 308 clean :: DB.ByteString -> DB.ByteString
 309 clean txt = DBC.map clean' txt
 310   where
 311     clean' '’' = '\''
 312     clean' '\r' = ' '
 313     clean' '\t' = ' '
 314     clean' ';' = '.'
 315     clean' c  = c
 316
 317 --
 318
 319 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
 320 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
 321 splitOn _ _                  = (DT.splitOn ", ")
 322