2 Module : Gargantext.Core.Text.Corpus.Parsers.Book
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Get Book into GarganText
14 module Gargantext.Core.Text.Corpus.Parsers.Book
import Data.Text (Text)
import GHC.IO (FilePath)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (hyperdataDocument2csv)
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import System.Directory -- (getDirectoryContents)
import System.FilePath ((</>))
import qualified Data.ByteString.Lazy as DBL
import qualified Data.List as List
import qualified Data.Text as DT
30 ------------------------------------------------------------------------
31 -- Main Export Function
-- | Path of the output file; 'publi2csv' writes its CSV result there.
33 type FileOut = FilePath
-- | Convert a directory of book files into a single CSV file.
--
-- The entries of @f_in@ are listed with 'filesOf' and read with
-- 'readPublis'; every file is split into 'Publi's by 'file2publi' (@n@ is
-- forwarded to it unchanged); each 'Publi' becomes a 'HyperdataDocument'
-- through 'publiToHyperdata'; the documents are serialised with
-- 'hyperdataDocument2csv' and written to @f_out@.
--
-- The 'Int' handed to 'publiToHyperdata' is the 1-based position of the
-- 'Publi' in the concatenated list of all files.
publi2csv :: Int -> FileDir -> FileOut -> IO ()
publi2csv n f_in f_out = do
  -- 'files' must be bound before it is passed to 'readPublis'.
  files <- filesOf f_in
  texts <- readPublis f_in files
  let publis = List.concatMap (file2publi n) texts
      docs   = List.zipWith publiToHyperdata [1..] publis
  DBL.writeFile f_out (hyperdataDocument2csv docs)
-- | List the entries of the directory @fd@, sorted by name.
--
-- 'listDirectory' leaves out the special entries @.@ and @..@ by itself.
-- The former filter kept only names longer than two characters, which
-- removed those two entries but also silently dropped any real file whose
-- name was one or two characters long.
--
-- The result contains entry names only; they are not prefixed with @fd@.
filesOf :: FileDir -> IO [FilePath]
filesOf fd = List.sort <$> listDirectory fd
-- | Read every file named in @fps@ from the directory @fd@ and pair each
-- name with its content converted to 'Text' (via 'cs').
--
-- The path is built with '</>' so that @fd@ works with or without a
-- trailing path separator. Plain concatenation (@fd <> fp@) produced a wrong
-- path such as @"dirfile"@ whenever the separator was missing.
--
-- The first component of each result is the name exactly as given in @fps@,
-- not the joined path.
readPublis :: FileDir -> [FilePath] -> IO [(FilePath, Text)]
readPublis fd fps = mapM readOne fps
  where
    readOne fp = do
      raw <- DBL.readFile (fd </> fp)
      pure (fp, cs raw)
51 ------------------------------------------------------------------------
-- | One entry produced by 'file2publi'.
--
-- 'file2publi' applies the constructor to four arguments: the authors and
-- the source taken from the file name, followed by the two components of a
-- pair returned by 'text2titleParagraphs'.
-- NOTE(review): only the first two fields are visible in this listing; the
-- remaining field names should be checked against the full declaration.
53 data Publi = Publi { publi_authors :: [Text]
54 , publi_source :: Text
-- | Information that 'fileNameInfo' extracts from a file name: the list of
-- authors first, then (as the constructor is used in 'fileNameInfo' and
-- 'file2publi') a second positional value used as the source.
-- NOTE(review): only the first field is visible in this listing.
60 data FileInfo = FileInfo { fi_authors :: [Text]
-- | Path of the input directory whose entries are listed by 'filesOf' and
-- read by 'readPublis'.
65 type FileDir = FilePath
66 ---------------------------------------------------------------------
-- | Turn the content of one file into a list of 'Publi's.
--
-- The text is cut into pairs by 'text2titleParagraphs' (which receives @n@);
-- every pair is combined with the authors and source that 'fileNameInfo'
-- derives from the file name @fp@.
file2publi :: Int -> (FilePath, Text) -> [Publi]
file2publi n (fp, theText) =
  let -- Lazy pattern binding: the file name is only parsed if a field of a
      -- resulting 'Publi' is actually demanded.
      FileInfo authors source = fileNameInfo fp
      toPubli (title, body)   = Publi authors source title body
  in  map toPubli (text2titleParagraphs n theText)
-- | Extract author and source information from a file name.
--
-- The name is split on @_@: the first component holds the authors, separated
-- from each other by @-and-@; the second component is the source. Any
-- further components are ignored.
--
-- Panics when the name contains no @_@. The message names this function and
-- the offending file name so that the bad input can be found.
fileNameInfo :: FilePath -> FileInfo
fileNameInfo fp =
  case DT.splitOn "_" (DT.pack fp) of
    (authors : source : _) -> FileInfo (DT.splitOn "-and-" authors) source
    _ -> panic $ "fileNameInfo: expected \"<authors>_<source>...\" but got: "
              <> DT.pack fp
81 ---------------------------------------------------------------------
-- | Build a 'HyperdataDocument' from a 'Publi'.
--
-- @y@ is stored as the publication year; 'publi2csv' passes the 1-based
-- position of the 'Publi' in its list. Month and day are always 1, the
-- database tag is always @"Book File"@ and the language is always 'FR'.
-- NOTE(review): some record fields are not visible in this listing (among
-- them whichever fields receive @s@ and @t@); only the visible ones are
-- described here.
82 publiToHyperdata :: Int -> Publi -> HyperdataDocument
83 publiToHyperdata y (Publi a s t txt) =
84 HyperdataDocument { _hd_bdd = Just "Book File"
87 , _hd_uniqId = Nothing
88 , _hd_uniqIdBdd = Nothing
-- NOTE(review): 'DT.concat' joins the author names with no separator, so
-- ["A", "B"] becomes "AB" -- confirm this is intended.
91 , _hd_authors = Just (DT.concat a)
92 , _hd_institutes = Nothing
-- The fourth 'Publi' field (@txt@) is used as the abstract.
94 , _hd_abstract = Just txt
95 , _hd_publication_date = Nothing
-- The year carries the index @y@, not a calendar year.
96 , _hd_publication_year = Just y
97 , _hd_publication_month = Just 1
98 , _hd_publication_day = Just 1
99 , _hd_publication_hour = Nothing
100 , _hd_publication_minute = Nothing
101 , _hd_publication_second = Nothing
-- The language is hard-coded: the textual form of the 'FR' constructor.
102 , _hd_language_iso2 = Just $ DT.pack $ show FR
105 -------------------------------------------------------------
106 -- MISC tool to remove urls for instance
-- | Remove every whitespace-separated word that is 20 characters or longer
-- and join the remaining words with single spaces.
clean :: Text -> Text
clean input = DT.unwords keptWords
  where
    keptWords = List.filter isShort (DT.words input)
    isShort w = DT.length w < 20