]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[Pipeline] clustering with C++ Louvain bindings, ok.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length)
29 import qualified Data.ByteString.Lazy as BL
30
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
33 import Safe (tailMay)
34
35 import Gargantext.Text
36 import Gargantext.Text.Context
37 import Gargantext.Prelude hiding (length)
38
39 ---------------------------------------------------------------
40 data Doc = Doc
41 { d_docId :: !Int
42 , d_title :: !Text
43 , d_source :: !Text
44 , d_publication_year :: !Int
45 , d_publication_month :: !Int
46 , d_publication_day :: !Int
47 , d_abstract :: !Text
48 , d_authors :: !Text
49 }
50 deriving (Show)
51 ---------------------------------------------------------------
52 toDocs :: Vector CsvDoc -> [Doc]
53 toDocs v = V.toList
54 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
55 -> Doc nId t s py pm pd abst auth )
56 (V.enumFromN 1 (V.length v'')) v''
57 where
58 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
59 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
60
61 ---------------------------------------------------------------
62 fromDocs :: Vector Doc -> Vector CsvDoc
63 fromDocs docs = V.map fromDocs' docs
64 where
65 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
66
67 ---------------------------------------------------------------
68 -- | Split a document in its context
69 -- TODO adapt the size of the paragraph according to the corpus average
70
71
72 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
73 splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
74 if docSize > 1000
75 then
76 if (mod (round m) docSize) >= 10
77 then
78 splitDoc' splt doc
79 else
80 V.fromList [doc]
81 else
82 V.fromList [doc]
83
84
85 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
86 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
87 where
88 firstDoc = CsvDoc t s py pm pd firstAbstract auth
89 firstAbstract = head' abstracts
90
91 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
92
93 abstracts = (splitBy $ contextSize) abst
94 head' x = maybe "" identity (head x)
95 tail' x = maybe [""] identity (tailMay x)
96
97 ---------------------------------------------------------------
98 ---------------------------------------------------------------
99 type Mean = Double
100
101 docsSize :: Vector CsvDoc -> Mean
102 docsSize csvDoc = mean ls
103 where
104 ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
105
106
107 ---------------------------------------------------------------
108 data CsvDoc = CsvDoc
109 { c_title :: !Text
110 , c_source :: !Text
111 , c_publication_year :: !Int
112 , c_publication_month :: !Int
113 , c_publication_day :: !Int
114 , c_abstract :: !Text
115 , c_authors :: !Text
116 }
117 deriving (Show)
118
119 instance FromNamedRecord CsvDoc where
120 parseNamedRecord r = CsvDoc <$> r .: "title"
121 <*> r .: "source"
122 <*> r .: "publication_year"
123 <*> r .: "publication_month"
124 <*> r .: "publication_day"
125 <*> r .: "abstract"
126 <*> r .: "authors"
127
128 instance ToNamedRecord CsvDoc where
129 toNamedRecord (CsvDoc t s py pm pd abst aut) =
130 namedRecord [ "title" .= t
131 , "source" .= s
132 , "publication_year" .= py
133 , "publication_month" .= pm
134 , "publication_day" .= pd
135 , "abstract" .= abst
136 , "authors" .= aut
137 ]
138
139
140 csvDecodeOptions :: DecodeOptions
141 csvDecodeOptions = (defaultDecodeOptions
142 {decDelimiter = fromIntegral $ ord '\t'}
143 )
144
145 csvEncodeOptions :: EncodeOptions
146 csvEncodeOptions = ( defaultEncodeOptions
147 {encDelimiter = fromIntegral $ ord '\t'}
148 )
149
150
151 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
152 readCsv fp = do
153 csvData <- BL.readFile fp
154 case decodeByNameWith csvDecodeOptions csvData of
155 Left e -> panic (pack e)
156 Right csvDocs -> pure csvDocs
157
158
159 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
160 writeCsv fp (h, vs) = BL.writeFile fp $
161 encodeByNameWith csvEncodeOptions h (V.toList vs)
162