src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30 import Data.Time.Segment (jour)
  31
  32 import Data.Vector (Vector)
  33 import qualified Data.Vector as V
  34 import Safe (tailMay)
  35
  36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  37 import Gargantext.Text
  38 import Gargantext.Text.Context
  39 import Gargantext.Prelude hiding (length)
  40
  41 ---------------------------------------------------------------
  42 data Doc = Doc
  43     { d_docId  :: !Int
  44     , d_title  :: !Text
  45     , d_source :: !Text
  46     , d_publication_year  :: !Int
  47     , d_publication_month :: !Int
  48     , d_publication_day   :: !Int
  49     , d_abstract          :: !Text
  50     , d_authors           :: !Text
  51     }
  52     deriving (Show)
  53 ---------------------------------------------------------------
  54 -- | Doc 2 HyperdataDocument
  55 doc2hyperdataDocument :: Doc -> HyperdataDocument
  56 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  57 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
  58   HyperdataDocument (Just "CSV")
  59                     (Just . pack . show $ did)
  60                     Nothing
  61                     Nothing
  62                     Nothing
  63                     Nothing
  64                     (Just dt)
  65                     (Just dau)
  66                     (Just dab)
  67                     (Nothing)
  68                     Nothing
  69                     (Just dpy)
  70                     (Just dpm)
  71                     (Just dpd)
  72                     Nothing
  73                     Nothing
  74                     Nothing
  75                     Nothing
  76 ---------------------------------------------------------------
  77 -- | Types Conversions
  78 toDocs :: Vector CsvDoc -> [Doc]
  79 toDocs v = V.toList
  80          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  81                        -> Doc nId t s py pm pd abst auth )
  82                        (V.enumFromN 1 (V.length v'')) v''
  83           where
  84             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  85             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  86
  87 ---------------------------------------------------------------
  88 fromDocs :: Vector Doc -> Vector CsvDoc
  89 fromDocs docs = V.map fromDocs' docs
  90   where
  91     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  92
  93 ---------------------------------------------------------------
  94 -- | Split a document in its context
  95 -- TODO adapt the size of the paragraph according to the corpus average
  96
  97 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
  98 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
  99                           if docSize > 1000
 100                             then
 101                               if (mod (round m) docSize) >= 10
 102                                 then
 103                                   splitDoc' splt doc
 104                                 else
 105                                   V.fromList [doc]
 106                             else
 107                               V.fromList [doc]
 108
 109
 110 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 111 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 112     where
 113       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 114       firstAbstract = head' abstracts
 115
 116       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
 117
 118       abstracts    = (splitBy $ contextSize) abst
 119       head' x = maybe ""   identity (head x)
 120       tail' x = maybe [""] identity (tailMay x)
 121
 122 ---------------------------------------------------------------
 123 ---------------------------------------------------------------
 124 type Mean = Double
 125
 126 docsSize :: Vector CsvDoc -> Mean
 127 docsSize csvDoc = mean ls
 128   where
 129     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 130
 131
 132 ---------------------------------------------------------------
 133 data CsvDoc = CsvDoc
 134     { csv_title  :: !Text
 135     , csv_source :: !Text
 136     , csv_publication_year  :: !Int
 137     , csv_publication_month :: !Int
 138     , csv_publication_day   :: !Int
 139     , csv_abstract          :: !Text
 140     , csv_authors           :: !Text
 141     }
 142     deriving (Show)
 143
 144 instance FromNamedRecord CsvDoc where
 145   parseNamedRecord r = CsvDoc <$> r .: "title"
 146                               <*> r .: "source"
 147                               <*> r .: "publication_year"
 148                               <*> r .: "publication_month"
 149                               <*> r .: "publication_day"
 150                               <*> r .: "abstract"
 151                               <*> r .: "authors"
 152
 153 instance ToNamedRecord CsvDoc where
 154   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 155     namedRecord [ "title"  .= t
 156                 , "source" .= s
 157                 , "publication_year"  .= py
 158                 , "publication_month" .= pm
 159                 , "publication_day"   .= pd
 160                 , "abstract"          .= abst
 161                 , "authors"           .= aut
 162                ]
 163
 164
 165 csvDecodeOptions :: DecodeOptions
 166 csvDecodeOptions = (defaultDecodeOptions
 167                       {decDelimiter = fromIntegral $ ord '\t'}
 168                     )
 169
 170 csvEncodeOptions :: EncodeOptions
 171 csvEncodeOptions = ( defaultEncodeOptions
 172                       {encDelimiter = fromIntegral $ ord '\t'}
 173                     )
 174
 175 ------------------------------------------------------------------------
 176 ------------------------------------------------------------------------
 177 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 178 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 179                       <$> snd
 180                       <$> readCsv fp
 181
 182 ------------------------------------------------------------------------
 183 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 184 readCsv fp = do
 185     csvData <- BL.readFile fp
 186     case decodeByNameWith csvDecodeOptions csvData of
 187       Left e        -> panic (pack e)
 188       Right csvDocs -> pure csvDocs
 189
 190
 191 readHal :: FilePath -> IO (Header, Vector CsvHal)
 192 readHal fp = do
 193     csvData <- BL.readFile fp
 194     case decodeByNameWith csvDecodeOptions csvData of
 195       Left e        -> panic (pack e)
 196       Right csvDocs -> pure csvDocs
 197 ------------------------------------------------------------------------
 198 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 199 writeCsv fp (h, vs) = BL.writeFile fp $
 200                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 201
 202
 203 ------------------------------------------------------------------------
 204 -- Hal Format
 205 data CsvHal = CsvHal
 206     { csvHal_title  :: !Text
 207     , csvHal_source :: !Text
 208     , csvHal_publication_year  :: !Integer
 209     , csvHal_publication_month :: !Int
 210     , csvHal_publication_day   :: !Int
 211     , csvHal_abstract          :: !Text
 212     , csvHal_authors           :: !Text
 213
 214     , csvHal_url               :: !Text
 215     , csvHal_isbn_s            :: !Text
 216     , csvHal_issue_s           :: !Text
 217     , csvHal_journalPublisher_s:: !Text
 218     , csvHal_language_s        :: !Text
 219
 220     , csvHal_doiId_s           :: !Text
 221     , csvHal_authId_i          :: !Text
 222     , csvHal_instStructId_i    :: !Text
 223     , csvHal_deptStructId_i    :: !Text
 224     , csvHal_labStructId_i     :: !Text
 225
 226     , csvHal_rteamStructId_i   :: !Text
 227     , csvHal_docType_s         :: !Text
 228     }
 229     deriving (Show)
 230
 231 instance FromNamedRecord CsvHal where
 232   parseNamedRecord r = CsvHal <$> r .: "title"
 233                               <*> r .: "source"
 234                               <*> r .: "publication_year"
 235                               <*> r .: "publication_month"
 236                               <*> r .: "publication_day"
 237                               <*> r .: "abstract"
 238                               <*> r .: "authors"
 239
 240                               <*> r .: "url"
 241                               <*> r .: "isbn_s"
 242                               <*> r .: "issue_s"
 243                               <*> r .: "journalPublisher_s"
 244                               <*> r .: "language_s"
 245
 246                               <*> r .: "doiId_s"
 247                               <*> r .: "authId_i"
 248                               <*> r .: "instStructId_i"
 249                               <*> r .: "deptStructId_i"
 250                               <*> r .: "labStructId_i"
 251
 252                               <*> r .: "rteamStructId_i"
 253                               <*> r .: "docType_s"
 254
 255 instance ToNamedRecord CsvHal where
 256   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 257     namedRecord [ "title"  .= t
 258                 , "source" .= s
 259
 260                 , "publication_year"  .= py
 261                 , "publication_month" .= pm
 262                 , "publication_day"   .= pd
 263
 264                 , "abstract"          .= abst
 265                 , "authors"           .= aut
 266
 267                 , "url"                .= url
 268                 , "isbn_s"             .= isbn
 269                 , "issue_s"            .= iss
 270                 , "journalPublisher_s" .= j
 271                 , "language_s"         .= lang
 272
 273                 , "doiId_s"            .= doi
 274                 , "authId_i"           .= auth
 275                 , "instStructId_i"     .= inst
 276                 , "deptStructId_i"     .= dept
 277                 , "labStructId_i"      .= lab
 278
 279                 , "rteamStructId_i"    .= team
 280                 , "docType_s"          .= doct
 281                ]
 282
 283 csvHal2doc :: CsvHal -> HyperdataDocument
 284 csvHal2doc (CsvHal title source
 285        pub_year pub_month pub_day
 286        abstract authors
 287        url _ _ _ _
 288        doi _ _ _ _
 289        _ _ ) = HyperdataDocument (Just "CsvHal")
 290                                (Just doi)
 291                                (Just url)
 292                                Nothing
 293                                Nothing
 294                                Nothing
 295                                (Just title)
 296                                (Just authors)
 297                                (Just source)
 298                                (Just abstract)
 299                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 300                                (Just $ fromIntegral pub_year)
 301                                (Just pub_month)
 302                                (Just pub_day)
 303                                Nothing
 304                                Nothing
 305                                Nothing
 306                                Nothing
 307
 308 ------------------------------------------------------------------------
 309 parseHal :: FilePath -> IO [HyperdataDocument]
 310 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
 311 ------------------------------------------------------------------------
 312
 313