src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30 import Data.Time.Segment (jour)
  31
  32 import Data.Vector (Vector)
  33 import qualified Data.Vector as V
  34
  35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  36 import Gargantext.Text
  37 import Gargantext.Text.Context
  38 import Gargantext.Prelude hiding (length)
  39
  40 ---------------------------------------------------------------
  41 headerCsvGargV3 :: Header
  42 headerCsvGargV3 = header [ "title"
  43          , "source"
  44          , "publication_year"
  45          , "publication_month"
  46          , "publication_day"
  47          , "abstract"
  48          , "authors"
  49          ]
  50 ---------------------------------------------------------------
  51 data Doc = Doc
  52     { d_docId  :: !Int
  53     , d_title  :: !Text
  54     , d_source :: !Text
  55     , d_publication_year  :: !Int
  56     , d_publication_month :: !Int
  57     , d_publication_day   :: !Int
  58     , d_abstract          :: !Text
  59     , d_authors           :: !Text
  60     }
  61     deriving (Show)
  62 ---------------------------------------------------------------
  63 -- | Doc 2 HyperdataDocument
  64 doc2hyperdataDocument :: Doc -> HyperdataDocument
  65 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  66 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
  67   HyperdataDocument (Just "CSV")
  68                     (Just . pack . show $ did)
  69                     Nothing
  70                     Nothing
  71                     Nothing
  72                     Nothing
  73                     (Just dt)
  74                     Nothing
  75                     (Just dau)
  76                     (Just dab)
  77                     (Nothing)
  78                     Nothing
  79                     (Just dpy)
  80                     (Just dpm)
  81                     (Just dpd)
  82                     Nothing
  83                     Nothing
  84                     Nothing
  85                     Nothing
  86 ---------------------------------------------------------------
  87 -- | Types Conversions
  88 toDocs :: Vector CsvDoc -> [Doc]
  89 toDocs v = V.toList
  90          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  91                        -> Doc nId t s py pm pd abst auth )
  92                        (V.enumFromN 1 (V.length v'')) v''
  93           where
  94             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  95             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  96
  97 ---------------------------------------------------------------
  98 fromDocs :: Vector Doc -> Vector CsvDoc
  99 fromDocs docs = V.map fromDocs' docs
 100   where
 101     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
 102
 103 ---------------------------------------------------------------
 104 -- | Split a document in its context
 105 -- TODO adapt the size of the paragraph according to the corpus average
 106
 107 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 108 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 109                           if docSize > 1000
 110                             then
 111                               if (mod (round m) docSize) >= 10
 112                                 then
 113                                   splitDoc' splt doc
 114                                 else
 115                                   V.fromList [doc]
 116                             else
 117                               V.fromList [doc]
 118
 119
 120 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 121 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 122     where
 123       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 124       firstAbstract = head' "splitDoc'1" abstracts
 125
 126       nextDocs = map (\txt -> CsvDoc
 127                                 (head' "splitDoc'2" $ sentences txt)
 128                                 s py pm pd
 129                                 (unsentences $ tail' "splitDoc'1" $ sentences txt)
 130                                 auth
 131                       ) (tail' "splitDoc'2" abstracts)
 132
 133       abstracts    = (splitBy $ contextSize) abst
 134
 135 ---------------------------------------------------------------
 136 ---------------------------------------------------------------
 137 type Mean = Double
 138
 139 docsSize :: Vector CsvDoc -> Mean
 140 docsSize csvDoc = mean ls
 141   where
 142     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 143
 144
 145 ---------------------------------------------------------------
 146 data CsvDoc = CsvDoc
 147     { csv_title  :: !Text
 148     , csv_source :: !Text
 149     , csv_publication_year  :: !Int
 150     , csv_publication_month :: !Int
 151     , csv_publication_day   :: !Int
 152     , csv_abstract          :: !Text
 153     , csv_authors           :: !Text
 154     }
 155     deriving (Show)
 156
 157 instance FromNamedRecord CsvDoc where
 158   parseNamedRecord r = CsvDoc <$> r .: "title"
 159                               <*> r .: "source"
 160                               <*> r .: "publication_year"
 161                               <*> r .: "publication_month"
 162                               <*> r .: "publication_day"
 163                               <*> r .: "abstract"
 164                               <*> r .: "authors"
 165
 166 instance ToNamedRecord CsvDoc where
 167   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 168     namedRecord [ "title"  .= t
 169                 , "source" .= s
 170                 , "publication_year"  .= py
 171                 , "publication_month" .= pm
 172                 , "publication_day"   .= pd
 173                 , "abstract"          .= abst
 174                 , "authors"           .= aut
 175                ]
 176
 177
 178 csvDecodeOptions :: DecodeOptions
 179 csvDecodeOptions = (defaultDecodeOptions
 180                       {decDelimiter = fromIntegral $ ord '\t'}
 181                     )
 182
 183 csvEncodeOptions :: EncodeOptions
 184 csvEncodeOptions = ( defaultEncodeOptions
 185                       {encDelimiter = fromIntegral $ ord '\t'}
 186                     )
 187
 188 ------------------------------------------------------------------------
 189 ------------------------------------------------------------------------
 190 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 191 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 192                       <$> snd
 193                       <$> readCsv fp
 194
 195 ------------------------------------------------------------------------
 196 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 197 readCsv fp = do
 198     csvData <- BL.readFile fp
 199     case decodeByNameWith csvDecodeOptions csvData of
 200       Left e        -> panic (pack e)
 201       Right csvDocs -> pure csvDocs
 202
 203
 204 readHal :: FilePath -> IO (Header, Vector CsvHal)
 205 readHal fp = do
 206     csvData <- BL.readFile fp
 207     case decodeByNameWith csvDecodeOptions csvData of
 208       Left e        -> panic (pack e)
 209       Right csvDocs -> pure csvDocs
 210 ------------------------------------------------------------------------
 211 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 212 writeCsv fp (h, vs) = BL.writeFile fp $
 213                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 214
 215
 216 ------------------------------------------------------------------------
 217 -- Hal Format
 218 data CsvHal = CsvHal
 219     { csvHal_title  :: !Text
 220     , csvHal_source :: !Text
 221     , csvHal_publication_year  :: !Integer
 222     , csvHal_publication_month :: !Int
 223     , csvHal_publication_day   :: !Int
 224     , csvHal_abstract          :: !Text
 225     , csvHal_authors           :: !Text
 226
 227     , csvHal_url               :: !Text
 228     , csvHal_isbn_s            :: !Text
 229     , csvHal_issue_s           :: !Text
 230     , csvHal_journalPublisher_s:: !Text
 231     , csvHal_language_s        :: !Text
 232
 233     , csvHal_doiId_s           :: !Text
 234     , csvHal_authId_i          :: !Text
 235     , csvHal_instStructId_i    :: !Text
 236     , csvHal_deptStructId_i    :: !Text
 237     , csvHal_labStructId_i     :: !Text
 238
 239     , csvHal_rteamStructId_i   :: !Text
 240     , csvHal_docType_s         :: !Text
 241     }
 242     deriving (Show)
 243
 244 instance FromNamedRecord CsvHal where
 245   parseNamedRecord r = CsvHal <$> r .: "title"
 246                               <*> r .: "source"
 247                               <*> r .: "publication_year"
 248                               <*> r .: "publication_month"
 249                               <*> r .: "publication_day"
 250                               <*> r .: "abstract"
 251                               <*> r .: "authors"
 252
 253                               <*> r .: "url"
 254                               <*> r .: "isbn_s"
 255                               <*> r .: "issue_s"
 256                               <*> r .: "journalPublisher_s"
 257                               <*> r .: "language_s"
 258
 259                               <*> r .: "doiId_s"
 260                               <*> r .: "authId_i"
 261                               <*> r .: "instStructId_i"
 262                               <*> r .: "deptStructId_i"
 263                               <*> r .: "labStructId_i"
 264
 265                               <*> r .: "rteamStructId_i"
 266                               <*> r .: "docType_s"
 267
 268 instance ToNamedRecord CsvHal where
 269   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 270     namedRecord [ "title"  .= t
 271                 , "source" .= s
 272
 273                 , "publication_year"  .= py
 274                 , "publication_month" .= pm
 275                 , "publication_day"   .= pd
 276
 277                 , "abstract"          .= abst
 278                 , "authors"           .= aut
 279
 280                 , "url"                .= url
 281                 , "isbn_s"             .= isbn
 282                 , "issue_s"            .= iss
 283                 , "journalPublisher_s" .= j
 284                 , "language_s"         .= lang
 285
 286                 , "doiId_s"            .= doi
 287                 , "authId_i"           .= auth
 288                 , "instStructId_i"     .= inst
 289                 , "deptStructId_i"     .= dept
 290                 , "labStructId_i"      .= lab
 291
 292                 , "rteamStructId_i"    .= team
 293                 , "docType_s"          .= doct
 294                ]
 295
 296 csvHal2doc :: CsvHal -> HyperdataDocument
 297 csvHal2doc (CsvHal title source
 298        pub_year pub_month pub_day
 299        abstract authors
 300        url _ _ _ _
 301        doi _ inst _ _
 302        _ _ ) = HyperdataDocument (Just "CsvHal")
 303                                (Just doi)
 304                                (Just url)
 305                                Nothing
 306                                Nothing
 307                                Nothing
 308                                (Just title)
 309                                (Just authors)
 310                                (Just inst)
 311                                (Just source)
 312                                (Just abstract)
 313                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 314                                (Just $ fromIntegral pub_year)
 315                                (Just pub_month)
 316                                (Just pub_day)
 317                                Nothing
 318                                Nothing
 319                                Nothing
 320                                Nothing
 321
 322 ------------------------------------------------------------------------
 323 parseHal :: FilePath -> IO [HyperdataDocument]
 324 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
 325 ------------------------------------------------------------------------
 326
 327