[FIX] rdf lib.

[gargantext.git] / src / Gargantext / Text / Parsers.hs
diff --git a/src/Gargantext/Text/Parsers.hs b/src/Gargantext/Text/Parsers.hs

index aa34caef7451940ef2ca2541af5ecc02157ec34f..ac8f449ed1d10bc7cadb49d78023a79cab784304 100644 (file)
--- a/src/Gargantext/Text/Parsers.hs
+++ b/src/Gargantext/Text/Parsers.hs
@@ -19,53 +19,62 @@ please follow the types.
  -}
  
  {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE PackageImports    #-}
+{-# LANGUAGE OverloadedStrings #-}
  
-module Gargantext.Text.Parsers -- (parse, FileFormat(..))
+module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
      where
  
-import Gargantext.Prelude
+import System.FilePath (FilePath(), takeExtension)
+import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  
-import System.FilePath (takeExtension, FilePath())
-import Data.Attoparsec.ByteString (parseOnly, Parser)
-import qualified Data.ByteString as DB
-import qualified Data.Map        as DM
+import Control.Monad (join)
+import qualified Data.Time as DT
  import Data.Either.Extra (partitionEithers)
+import Data.Time (UTCTime(..))
+import Data.List (concat)
+import qualified Data.Map        as DM
+import qualified Data.ByteString as DB
  import Data.Ord()
-import Data.Foldable (concat)
  import Data.String()
-import Data.Either.Extra(Either())
+import Data.Either(Either(..))
+import Data.Attoparsec.ByteString (parseOnly, Parser)
  
  import Data.Text (Text)
-import Data.Text.Encoding (decodeUtf8)
-----
---import Control.Monad (join)
-import Codec.Archive.Zip (withArchive, getEntry, getEntries)
-import Path.IO (resolveFile')
------- import qualified Data.ByteString.Lazy as B
---import Control.Applicative ( (<$>) )
+import qualified Data.Text as DT
+
+-- Activate Async for to parse in parallel
  import Control.Concurrent.Async as CCA (mapConcurrently)
  
+import Data.Text.Encoding (decodeUtf8)
  import Data.String (String())
-import Gargantext.Text.Parsers.WOS (wosParser)
----- import Gargantext.Parsers.XML (xmlParser)
----- import Gargantext.Parsers.DOC (docParser)
----- import Gargantext.Parsers.ODT (odtParser)
+import Data.List (lookup)
  
---import Gargantext.Prelude (pm)
---import Gargantext.Types.Main (ErrorMessage(), Corpus)
+------------------------------------------------------------------------
+import Gargantext.Core (Lang(..))
+import Gargantext.Prelude
+import Gargantext.Database.Types.Node (HyperdataDocument(..))
+import Gargantext.Text.Parsers.WOS (wosParser)
+import Gargantext.Text.Parsers.Date (parseDate)
+import Gargantext.Text.Parsers.CSV (parseHal)
+import Gargantext.Text.Terms.Stop (detectLang)
+------------------------------------------------------------------------
  
--- FIXME
---type Field = Text
  type ParseError = String
---
---data Corpus = Corpus { _corpusErrors :: [ParseError]
---                     , _corpusMap    :: Map FilePath (Map Field Text)
---                    }
+--type Field      = Text
+--type Document   = DM.Map Field Text
+--type FilesParsed = DM.Map FilePath FileParsed
+--data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
+--                              , _fileParsed_result :: [Document]
+--                              } deriving (Show)
  
  
  -- | According to the format of Input file,
  -- different parser are available.
-data FileFormat = WOS        -- Implemented (ISI Format)
+data FileFormat = WOS | CsvHalFormat -- | CsvGargV3
+  deriving (Show)
+
+-- Implemented (ISI Format)
  --                | DOC        -- Not Implemented / import Pandoc
  --                | ODT        -- Not Implemented / import Pandoc
  --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
@@ -75,6 +84,57 @@ data FileFormat = WOS        -- Implemented (ISI Format)
  -- TODO: to debug maybe add the filepath in error message
  
  
+-- | Parse file into documents
+-- TODO manage errors here
+parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
+parseDocs WOS    path = join $ mapM (toDoc WOS) <$> snd <$> parse WOS path
+parseDocs CsvHalFormat p = parseHal p
+
+type Year  = Int
+type Month = Int
+type Day   = Int
+
+-- | Parse date to Ints
+-- TODO add hours, minutes and seconds
+parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
+parseDate' _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
+parseDate' l (Just txt) = do
+  utcTime <- parseDate l txt
+  let (UTCTime day _) = utcTime
+  let (y,m,d) = DT.toGregorian day
+  pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
+
+
+toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
+toDoc WOS d = do
+      let abstract = lookup "abstract" d
+      let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
+      
+      let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
+      
+      (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang  dateToParse
+
+      pure $ HyperdataDocument (Just $ DT.pack $ show WOS)
+                               (lookup "doi" d)
+                               (lookup "URL" d)
+                                Nothing
+                                Nothing
+                                Nothing
+                               (lookup "title" d)
+                                Nothing
+                               (lookup "authors" d)
+                               (lookup "source" d)
+                               (lookup "abstract" d)
+                               (fmap (DT.pack . show) utcTime)
+                               (pub_year)
+                               (pub_month)
+                               (pub_day)
+                               Nothing
+                               Nothing
+                               Nothing
+                               (Just $ (DT.pack . show) lang)
+toDoc _ _ = undefined
+
  parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
  parse format path = do
      files <- case takeExtension path of
@@ -88,24 +148,28 @@ parse format path = do
  
  
  -- | withParser:
--- According the format of the text, choosing the right parser.
+-- According to the format of the text, choose the right parser.
  -- TODO  withParser :: FileFormat -> Parser [Document]
  withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
  withParser WOS = wosParser
  --withParser DOC = docParser
  --withParser ODT = odtParser
  --withParser XML = xmlParser
---withParser _   = error "[ERROR] Parser not implemented yet"
+withParser _   = panic "[ERROR] Parser not implemented yet"
  
-runParser :: FileFormat -> DB.ByteString 
+runParser :: FileFormat -> DB.ByteString
            -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
  runParser format text = pure $ parseOnly (withParser format) text
  
  openZip :: FilePath -> IO [DB.ByteString]
  openZip fp = do
-    path    <- resolveFile' fp
-    entries <- withArchive path (DM.keys <$> getEntries)
-    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
+    entries <- withArchive fp (DM.keys <$> getEntries)
+    bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
      pure bs
  
+clean :: Text -> Text
+clean txt = DT.map clean' txt
+  where
+    clean' '’' = '\''
+    clean' c  = c