]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[MERGE] bashql + pipeline.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
33 import Safe (tailMay)
34
35 import Gargantext.Text
36 import Gargantext.Text.Context
37 import Gargantext.Prelude hiding (length)
38
39 ---------------------------------------------------------------
40 data Doc = Doc
41 { d_docId :: !Int
42 , d_title :: !Text
43 , d_source :: !Text
44 , d_publication_year :: !Int
45 , d_publication_month :: !Int
46 , d_publication_day :: !Int
47 , d_abstract :: !Text
48 , d_authors :: !Text
49 }
50 deriving (Show)
51 ---------------------------------------------------------------
52 toDocs :: Vector CsvDoc -> [Doc]
53 toDocs v = V.toList
54 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
55 -> Doc nId t s py pm pd abst auth )
56 (V.enumFromN 1 (V.length v'')) v''
57 where
58 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
59 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
60
61 ---------------------------------------------------------------
62 fromDocs :: Vector Doc -> Vector CsvDoc
63 fromDocs docs = V.map fromDocs' docs
64 where
65 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
66
67 ---------------------------------------------------------------
68 -- | Split a document in its context
69 -- TODO adapt the size of the paragraph according to the corpus average
70
71 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
72 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
73 if docSize > 1000
74 then
75 if (mod (round m) docSize) >= 10
76 then
77 splitDoc' splt doc
78 else
79 V.fromList [doc]
80 else
81 V.fromList [doc]
82
83
84 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
85 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
86 where
87 firstDoc = CsvDoc t s py pm pd firstAbstract auth
88 firstAbstract = head' abstracts
89
90 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
91
92 abstracts = (splitBy $ contextSize) abst
93 head' x = maybe "" identity (head x)
94 tail' x = maybe [""] identity (tailMay x)
95
96 ---------------------------------------------------------------
97 ---------------------------------------------------------------
98 type Mean = Double
99
100 docsSize :: Vector CsvDoc -> Mean
101 docsSize csvDoc = mean ls
102 where
103 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
104
105
106 ---------------------------------------------------------------
107 data CsvDoc = CsvDoc
108 { csv_title :: !Text
109 , csv_source :: !Text
110 , csv_publication_year :: !Int
111 , csv_publication_month :: !Int
112 , csv_publication_day :: !Int
113 , csv_abstract :: !Text
114 , csv_authors :: !Text
115 }
116 deriving (Show)
117
118 instance FromNamedRecord CsvDoc where
119 parseNamedRecord r = CsvDoc <$> r .: "title"
120 <*> r .: "source"
121 <*> r .: "publication_year"
122 <*> r .: "publication_month"
123 <*> r .: "publication_day"
124 <*> r .: "abstract"
125 <*> r .: "authors"
126
127 instance ToNamedRecord CsvDoc where
128 toNamedRecord (CsvDoc t s py pm pd abst aut) =
129 namedRecord [ "title" .= t
130 , "source" .= s
131 , "publication_year" .= py
132 , "publication_month" .= pm
133 , "publication_day" .= pd
134 , "abstract" .= abst
135 , "authors" .= aut
136 ]
137
138
139 csvDecodeOptions :: DecodeOptions
140 csvDecodeOptions = (defaultDecodeOptions
141 {decDelimiter = fromIntegral $ ord '\t'}
142 )
143
144 csvEncodeOptions :: EncodeOptions
145 csvEncodeOptions = ( defaultEncodeOptions
146 {encDelimiter = fromIntegral $ ord '\t'}
147 )
148
149 ------------------------------------------------------------------------
150 ------------------------------------------------------------------------
151 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
152 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
153 <$> snd
154 <$> readCsv fp
155
156 ------------------------------------------------------------------------
157 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
158 readCsv fp = do
159 csvData <- BL.readFile fp
160 case decodeByNameWith csvDecodeOptions csvData of
161 Left e -> panic (pack e)
162 Right csvDocs -> pure csvDocs
163
164
165 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
166 writeCsv fp (h, vs) = BL.writeFile fp $
167 encodeByNameWith csvEncodeOptions h (V.toList vs)
168