src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
  24     where
  25
  26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  28 import Conduit
  29 import Control.Concurrent.Async as CCA (mapConcurrently)
  30 import Control.Monad (join)
  31 import Control.Monad.Trans.Control (MonadBaseControl)
  32 import Data.Attoparsec.ByteString (parseOnly, Parser)
  33 import Data.Either(Either(..))
  34 import Data.Either.Extra (partitionEithers)
  35 import Data.List (concat, lookup)
  36 import Data.Ord()
  37 import Data.String (String())
  38 import Data.String()
  39 import Data.Text (Text, intercalate, pack, unpack)
  40 import Data.Text.Encoding (decodeUtf8)
  41 import Data.Tuple.Extra (both, first, second)
  42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
  43 import Gargantext.Core (Lang(..))
  44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
  45 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  46 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  47 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
  48 import Gargantext.Prelude
  49 import System.FilePath (FilePath(), takeExtension)
  50 import System.IO.Temp (emptySystemTempFile)
  51 import qualified Data.ByteString       as DB
  52 import qualified Data.ByteString.Char8 as DBC
  53 import qualified Data.ByteString.Lazy  as DBL
  54 import qualified Data.Map              as DM
  55 import qualified Data.Text             as DT
  56 import qualified Data.Text as Text
  57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date      as Date
  58 import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq  as Iramuteq
  59 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS       as RIS
  60 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS       as WOS
  61 import qualified Prelude
  62 ------------------------------------------------------------------------
  63
  64 type ParseError = String
  65 --type Field      = Text
  66 --type Document   = DM.Map Field Text
  67 --type FilesParsed = DM.Map FilePath FileParsed
  68 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  69 --                              , _fileParsed_result :: [Document]
  70 --                              } deriving (Show)
  71
  72
  73 -- | According to the format of Input file,
  74 -- different parser are available.
  75 data FileType = WOS
  76               | RIS
  77               | RisPresse
  78               | CsvGargV3
  79               | CsvHal
  80               | Iramuteq
  81   deriving (Show)
  82
  83 -- Implemented (ISI Format)
  84 --                | DOC        -- Not Implemented / import Pandoc
  85 --                | ODT        -- Not Implemented / import Pandoc
  86 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  87 --                | XML        -- Not Implemented / see :
  88
  89 parseFormatC :: MonadBaseControl IO m
  90              => FileType
  91              -> FileFormat
  92              -> DB.ByteString
  93              -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
  94 parseFormatC CsvGargV3 Plain bs = do
  95   let eParsedC = parseCsvC $ DBL.fromStrict bs
  96   case eParsedC of
  97     Left err -> pure $ Left err
  98     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
  99 parseFormatC CsvHal    Plain bs = do
 100   let eParsedC = parseCsvC $ DBL.fromStrict bs
 101   case eParsedC of
 102     Left err -> pure $ Left err
 103     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
 104 parseFormatC RisPresse Plain bs = do
 105   --docs <- enrichWith RisPresse
 106   let eDocs = runParser' RisPresse bs
 107   pure $ (\docs ->
 108             ( Just $ fromIntegral $ length docs
 109             , yieldMany docs
 110               .| mapC presseEnrich
 111               .| mapC (map $ both decodeUtf8)
 112               .| mapMC (toDoc RIS)) ) <$> eDocs
 113 parseFormatC WOS Plain bs = do
 114   let eDocs = runParser' WOS bs
 115   pure $ (\docs ->
 116             ( Just $ fromIntegral $ length docs
 117             , yieldMany docs
 118               .| mapC (map $ first WOS.keys)
 119               .| mapC (map $ both decodeUtf8)
 120               .| mapMC (toDoc WOS)) ) <$> eDocs
 121
 122 parseFormatC Iramuteq Plain bs = do
 123   let eDocs = runParser' Iramuteq bs
 124   pure $ (\docs ->
 125             ( Just $ fromIntegral $ length docs
 126             , yieldMany docs
 127               .| mapC (map $ first Iramuteq.keys)
 128               .| mapC (map $ both decodeUtf8)
 129               .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " ")))) ))<$> eDocs
 130
 131 parseFormatC ft ZIP bs = do
 132   path <- liftBase $ emptySystemTempFile "parsed-zip"
 133   liftBase $ DB.writeFile path bs
 134   fileContents <- liftBase $ withArchive path $ do
 135     files <- DM.keys <$> getEntries
 136     mapM getEntry files
 137   --printDebug "[parseFormatC] fileContents" fileContents
 138   eContents <- mapM (parseFormatC ft Plain) fileContents
 139   --printDebug "[parseFormatC] contents" contents
 140   --pure $ Left $ "Not implemented for ZIP"
 141   let (errs, contents) = partitionEithers eContents
 142   case errs of
 143     [] ->
 144       case contents of
 145         [] -> pure $ Left "No files in zip"
 146         _  -> do
 147           let lenghts = fst <$> contents
 148           let contents' = snd <$> contents
 149           let totalLength = sum $ sum <$> lenghts  -- Trick: sum (Just 1) = 1, sum Nothing = 0
 150           pure $ Right ( Just totalLength
 151                        , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
 152     _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
 153
 154 parseFormatC _ _ _ = undefined
 155
 156 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
 157 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
 158 -- parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
 159 -- parseFormat RisPresse bs = do
 160 --   docs <- mapM (toDoc RIS)
 161 --           <$> snd
 162 --           <$> enrichWith RisPresse
 163 --           $ partitionEithers
 164 --           $ [runParser'  RisPresse bs]
 165 --   pure $ Right docs
 166 -- parseFormat WOS bs = do
 167 --   docs <- mapM (toDoc WOS)
 168 --           <$> snd
 169 --           <$> enrichWith WOS
 170 --           $ partitionEithers
 171 --           $ [runParser'  WOS bs]
 172 --   pure $ Right docs
 173 -- parseFormat ZIP bs = do
 174 --   path <- emptySystemTempFile "parsed-zip"
 175 --   DB.writeFile path bs
 176 --   parsedZip <- withArchive path $ do
 177 --     DM.keys <$> getEntries
 178 --   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 179 -- parseFormat _ _ = undefined
 180
 181 -- | Parse file into documents
 182 -- TODO manage errors here
 183 -- TODO: to debug maybe add the filepath in error message
 184
 185 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 186 parseFile CsvHal    Plain p = parseHal p
 187 parseFile CsvGargV3 Plain p = parseCsv p
 188
 189 parseFile RisPresse Plain p = do
 190   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 191   pure $ Right docs
 192
 193 parseFile WOS       Plain p = do
 194   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 195   pure $ Right docs
 196
 197 parseFile Iramuteq       Plain p = do
 198   docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
 199               <$> snd
 200               <$> enrichWith Iramuteq
 201               <$> readFileWith Iramuteq p
 202   pure $ Right docs
 203
 204
 205 parseFile ff        _ p = do
 206   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 207   pure $ Right docs
 208
 209 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
 210 -- TODO use language for RIS
 211 toDoc ff d = do
 212       -- let abstract = lookup "abstract" d
 213       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 214
 215       let dateToParse = DT.replace " " "" <$> lookup "PY" d  -- <> Just " " <> lookup "publication_date" d
 216       -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
 217       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
 218
 219       let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 220                              , _hd_doi = lookup "doi" d
 221                              , _hd_url = lookup "URL" d
 222                              , _hd_uniqId = Nothing
 223                              , _hd_uniqIdBdd = Nothing
 224                              , _hd_page = Nothing
 225                              , _hd_title = lookup "title" d
 226                              , _hd_authors = lookup "authors" d
 227                              , _hd_institutes = lookup "institutes" d
 228                              , _hd_source = lookup "source" d
 229                              , _hd_abstract = lookup "abstract" d
 230                              , _hd_publication_date = fmap (DT.pack . show) utcTime
 231                              , _hd_publication_year = pub_year
 232                              , _hd_publication_month = pub_month
 233                              , _hd_publication_day = pub_day
 234                              , _hd_publication_hour = Nothing
 235                              , _hd_publication_minute = Nothing
 236                              , _hd_publication_second = Nothing
 237                              , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 238       -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
 239       pure hd
 240
 241 enrichWith :: FileType
 242            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 243 enrichWith RisPresse = enrichWith' presseEnrich
 244 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 245 enrichWith Iramuteq  = enrichWith' (map (first Iramuteq.keys))
 246 enrichWith _         = enrichWith' identity
 247
 248
 249 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 250            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 251 enrichWith' f = second (map both' . map f . concat)
 252   where
 253     both'   = map (both decodeUtf8)
 254
 255
 256
 257 readFileWith :: FileType -> FilePath
 258        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 259 readFileWith format path = do
 260     files <- case takeExtension path of
 261               ".zip" -> openZip              path
 262               _      -> pure <$> clean <$> DB.readFile path
 263     partitionEithers <$> mapConcurrently (runParser format) files
 264
 265
 266 -- | withParser:
 267 -- According to the format of the text, choose the right parser.
 268 -- TODO  withParser :: FileType -> Parser [Document]
 269 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
 270 withParser WOS      = WOS.parser
 271 withParser RIS      = RIS.parser
 272 withParser Iramuteq = Iramuteq.parser
 273 --withParser ODT = odtParser
 274 --withParser XML = xmlParser
 275 withParser _   = panic "[ERROR] Parser not implemented yet"
 276
 277 runParser :: FileType -> DB.ByteString
 278           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 279 runParser format text = pure $ runParser' format text
 280
 281 runParser' :: FileType -> DB.ByteString
 282           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 283 runParser' format text = parseOnly (withParser format) text
 284
 285 openZip :: FilePath -> IO [DB.ByteString]
 286 openZip fp = do
 287     entries <- withArchive fp (DM.keys <$> getEntries)
 288     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 289     pure bs
 290
 291 cleanText :: Text -> Text
 292 cleanText = cs . clean . cs
 293
 294 clean :: DB.ByteString -> DB.ByteString
 295 clean txt = DBC.map clean' txt
 296   where
 297     clean' '’' = '\''
 298     clean' '\r' = ' '
 299     clean' '\t' = ' '
 300     clean' ';' = '.'
 301     clean' c  = c
 302
 303 --
 304
 305 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
 306 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
 307 splitOn _ _                  = (DT.splitOn ", ")
 308