]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Book.hs
Add more Bool Query Engine tests
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Book.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Book
Description : Import plain-text book files into GarganText as a CSV corpus.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Get Book into GarganText
11
12 -}
13
14 module Gargantext.Core.Text.Corpus.Parsers.Book
15 where
16
17 import Data.Maybe
18 import Data.Text (Text)
19 import GHC.IO (FilePath)
20 import Gargantext.Core (Lang(..))
21 import Gargantext.Core.Text.Corpus.Parsers.CSV (hyperdataDocument2csv)
22 import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
23 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
24 import Gargantext.Prelude
25 import System.Directory -- (getDirectoryContents)
26 import qualified Data.ByteString.Lazy as DBL
27 import qualified Data.List as List
28 import qualified Data.Text as DT
29
30 ------------------------------------------------------------------------
31 -- Main Export Function
32
-- | Path of the CSV file written as output by 'book2csv'.
type FileOut = FilePath
34
-- | Convert a directory of plain-text book files into one CSV corpus file.
--
-- Each file in @f_in@ is cut into titled paragraphs of @n@ sentences
-- ('file2publi'); every paragraph becomes one 'HyperdataDocument' row,
-- numbered from 1, and the whole corpus is written to @f_out@ in CSV form.
book2csv :: Int -> FileDir -> FileOut -> IO ()
book2csv n f_in f_out = do
  fileNames <- filesOf f_in
  contents  <- readPublis f_in fileNames
  let publis = concatMap (file2publi n) contents
      -- NOTE(review): the 1-based index is stored as the publication *year*
      -- downstream — presumably a placeholder; confirm with the CSV consumer.
      docs   = zipWith publiToHyperdata [1 ..] publis
  DBL.writeFile f_out (hyperdataDocument2csv docs)
42
-- | List the entries of a directory, sorted by file name.
--
-- Names of length <= 2 are dropped, which filters out the @"."@ and @".."@
-- entries returned by 'getDirectoryContents' (and any other very short name).
filesOf :: FileDir -> IO [FilePath]
filesOf dir = fmap (List.sort . keepReal) (getDirectoryContents dir)
  where
    keepReal = List.filter (\name -> DT.length (cs name) > 2)
47
-- | Read each named file under the given directory, pairing the file name
-- with its decoded text content.
readPublis :: FileDir -> [FilePath] -> IO [(FilePath, Text)]
readPublis dir names = mapM readOne names
  where
    -- NOTE: the directory and file name are joined with plain (<>); the
    -- caller is expected to pass a directory path ending in a separator.
    readOne name = do
      raw <- DBL.readFile (dir <> name)
      pure (name, cs raw)
50
51 ------------------------------------------------------------------------
52 -- Main Types
-- | One publication unit extracted from a book file: a titled paragraph
-- together with the authors/source metadata parsed from the file name.
data Publi = Publi { publi_authors :: [Text] -- ^ author names (file name split on "-and-")
                   , publi_source  :: Text   -- ^ source label from the file name
                   , publi_title   :: Text   -- ^ paragraph title
                   , publi_text    :: Text   -- ^ paragraph body
                   }
  deriving (Show)
59
-- | Metadata carried by a book file's name, of the shape
-- @\<author1\>-and-\<author2\>_\<source\>_...@ (see 'fileNameInfo').
data FileInfo = FileInfo { fi_authors :: [Text] -- ^ author names
                         , fi_source  :: Text   -- ^ source label
                         }
  deriving (Show)
64
-- | Path of the input directory holding the book files.
type FileDir = FilePath
66 ---------------------------------------------------------------------
67
-- | Split one book file into 'Publi's: the text is cut into titled
-- paragraphs of @n@ sentences, and the authors/source parsed from the
-- file name are attached to every paragraph.
file2publi :: Int -> (FilePath, Text) -> [Publi]
file2publi n (path, fullText) =
  [ Publi authors source title body
  | (title, body) <- text2titleParagraphs n fullText
  ]
  where
    FileInfo authors source = fileNameInfo path
73
-- | Parse authors and source out of a book file name.
--
-- Expected shape: @\<a1\>-and-\<a2\>-and-..._\<source\>_anything@ — the name is
-- split on \"_\"; the first field is further split on \"-and-\" into the
-- author list, the second field is the source.
--
-- Calls 'panic' on a file name with fewer than two \"_\"-separated fields;
-- the message now names the offending file instead of the bare \"error\".
fileNameInfo :: FilePath -> FileInfo
fileNameInfo fp = toFileInfo parts
  where
    parts = DT.splitOn "_" $ DT.pack fp
    toFileInfo (authors:source:_) = FileInfo (DT.splitOn "-and-" authors) (cs source)
    toFileInfo _ =
      panic $ "fileNameInfo: malformed book file name (expected \"<authors>_<source>_...\"): "
            <> DT.pack fp
80
81 ---------------------------------------------------------------------
-- | Wrap one 'Publi' into a 'HyperdataDocument' for CSV export.
--
-- @y@ is the document's sequence number, stored as the publication year
-- (month/day are fixed to 1st of January); the language is hard-coded to FR.
--
-- Fix: authors were joined with 'DT.concat', gluing names together with no
-- separator (\"AliceBob\"); they are now joined with @\", \"@.
publiToHyperdata :: Int -> Publi -> HyperdataDocument
publiToHyperdata y (Publi a s t txt) =
  HyperdataDocument { _hd_bdd = Just "Book File"
                    , _hd_doi = Nothing
                    , _hd_url = Nothing
                    , _hd_uniqId = Nothing
                    , _hd_uniqIdBdd = Nothing
                    , _hd_page = Nothing
                    , _hd_title = Just t
                    , _hd_authors = Just (DT.intercalate ", " a)
                    , _hd_institutes = Nothing
                    , _hd_source = Just s
                    , _hd_abstract = Just txt
                    , _hd_publication_date = Nothing
                    , _hd_publication_year = Just y
                    , _hd_publication_month = Just 1
                    , _hd_publication_day = Just 1
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = Just $ DT.pack $ show FR
                    }
104
105 -------------------------------------------------------------
106 -- MISC tool to remove urls for instance
-- | Drop over-long words (>= 20 characters) from a text — a cheap way to
-- strip URLs and similar noise — and rejoin the remainder with spaces.
clean :: Text -> Text
clean txt = DT.unwords (List.filter isShort (DT.words txt))
  where
    isShort w = DT.length w < 20