]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
[MERGE] Upgrading postgresql and ngrams repo changes
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / FrameWrite.hs
1 module Gargantext.Core.Text.Corpus.Parsers.FrameWrite where
2
3 import Control.Applicative ((*>))
4 import Control.Monad (void)
5 import Data.Either
6 import Data.Maybe
7 import Data.Text hiding (foldl)
8 import Gargantext.Prelude
9 import Prelude ((++), read)
10 import Text.Parsec hiding (Line)
11 import Text.Parsec.String
12
13
14 -- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
15
16 -- title : everything above the first ==
17 -- Authors : default : anonymous ; except if the following line is encountered ^@@authors: FirstName1, LastName1 ; FirstName2, LastName2 ; etc.
18 -- date : default : date of last change except if the following line is encountered ^@@date: 2021-09-10
19 -- source: Name of the root node except if the following line is encountered ^@@source:
20 -- By default, 1 framawrite node = 1 document. Option for further developments: allow to give a level at generation for the split within framawrite node : :
21 --
22 -- par défaut: un doc == 1 NodeWrite
23 -- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.
24
25 sample :: Text
26 sample =
27 unlines
28 [ "title1"
29 , "title2"
30 , "=="
31 , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
32 , "^@@date: 2021-09-10"
33 , "^@@source: someSource"
34 , "document contents 1"
35 , "document contents 2"
36 ]
37
38 sampleUnordered :: Text
39 sampleUnordered =
40 unlines
41 [ "title1"
42 , "title2"
43 , "=="
44 , "document contents 1"
45 , "^@@date: 2021-09-10"
46 , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
47 , "^@@source: someSource"
48 , "document contents 2"
49 ]
50
51 -- parseSample = parse documentP "sample" (unpack sample)
52 -- parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
53 parseLinesSample :: Either ParseError Parsed
54 parseLinesSample = parseLines sample
55 parseLinesSampleUnordered :: Either ParseError Parsed
56 parseLinesSampleUnordered = parseLines sampleUnordered
57
58 data Author =
59 Author { firstName :: Text
60 , lastName :: Text }
61 deriving (Show)
62
63 data Parsed =
64 Parsed { title :: Text
65 , authors :: [Author]
66 , date :: Maybe Date
67 , source :: Maybe Text
68 , contents :: Text }
69 deriving (Show)
70
71 emptyParsed :: Parsed
72 emptyParsed =
73 Parsed { title = ""
74 , authors = []
75 , date = Nothing
76 , source = Nothing
77 , contents = "" }
78
79 data Date =
80 Date { year :: Integer
81 , month :: Integer
82 , day :: Integer }
83 deriving (Show)
84
85 data Line =
86 LAuthors [Author]
87 | LContents Text
88 | LDate Date
89 | LSource Text
90 | LTitle Text
91 deriving (Show)
92
93 parseLines :: Text -> Either ParseError Parsed
94 parseLines text = foldl f emptyParsed <$> lst
95 where
96 lst = parse documentLinesP "" (unpack text)
97 f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. }
98 f (Parsed { .. }) (LContents c) = Parsed { contents = concat [contents, c], .. }
99 f (Parsed { .. }) (LDate d ) = Parsed { date = Just d, .. }
100 f (Parsed { .. }) (LSource s ) = Parsed { source = Just s, .. }
101 f (Parsed { .. }) (LTitle t ) = Parsed { title = t, .. }
102
103 documentLinesP :: Parser [Line]
104 documentLinesP = do
105 t <- titleP
106 ls <- lineP `sepBy` newline
107 pure $ [LTitle $ pack t] ++ ls
108
109 lineP :: Parser Line
110 lineP = do
111 choice [ try authorsLineP
112 , try dateLineP
113 , try sourceLineP
114 , contentsLineP ]
115
116 authorsLineP :: Parser Line
117 authorsLineP = do
118 authors <- authorsP
119 pure $ LAuthors authors
120
121 dateLineP :: Parser Line
122 dateLineP = do
123 date <- dateP
124 pure $ LDate date
125
126 sourceLineP :: Parser Line
127 sourceLineP = do
128 source <- sourceP
129 pure $ LSource $ pack source
130
131 contentsLineP :: Parser Line
132 contentsLineP = do
133 contents <- many (noneOf "\n")
134 pure $ LContents $ pack contents
135
136 --------------------
137
138 -- documentP = do
139 -- t <- titleP
140 -- a <- optionMaybe authorsP
141 -- d <- optionMaybe dateP
142 -- s <- optionMaybe sourceP
143 -- c <- contentsP
144 -- pure $ Parsed { title = pack t
145 -- , authors = fromMaybe [] a
146 -- , date = pack <$> d
147 -- , source = pack <$> s
148 -- , contents = pack c }
149
150 titleDelimiterP :: Parser ()
151 titleDelimiterP = do
152 _ <- newline
153 _ <- string "=="
154 tokenEnd
155 titleP :: Parser [Char]
156 titleP = manyTill anyChar (try titleDelimiterP)
157
158 authorsPrefixP :: Parser [Char]
159 authorsPrefixP = do
160 _ <- string "^@@authors:"
161 many (char ' ')
162 authorsP :: Parser [Author]
163 authorsP = try authorsPrefixP *> sepBy authorP (char ';')
164 authorP :: Parser Author
165 authorP = do
166 fn <- manyTill anyChar (char ',')
167 _ <- many (char ' ')
168 --ln <- manyTill anyChar (void (char ';') <|> tokenEnd)
169 --ln <- manyTill anyChar (tokenEnd)
170 ln <- many (noneOf "\n")
171 pure $ Author { firstName = pack fn, lastName = pack ln }
172 -- manyTill anyChar (void (char '\n') <|> eof)
173
174 datePrefixP :: Parser [Char]
175 datePrefixP = do
176 _ <- string "^@@date:"
177 many (char ' ')
178 dateP :: Parser Date
179 dateP = try datePrefixP
180 *> dateISOP
181 -- *> many (noneOf "\n")
182
183 dateISOP :: Parser Date
184 dateISOP = do
185 year <- rd <$> number
186 _ <- char '-'
187 month <- rd <$> number
188 _ <- char '-'
189 day <- rd <$> number
190 _ <- many (noneOf "\n" )
191 pure $ Date { year, month, day }
192 where
193 rd = read :: [Char] -> Integer
194 number = many1 digit
195
196 sourcePrefixP :: Parser [Char]
197 sourcePrefixP = do
198 _ <- string "^@@source:"
199 many (char ' ')
200 sourceP :: Parser [Char]
201 sourceP = try sourcePrefixP
202 *> many (noneOf "\n")
203
204 -- contentsP :: Parser String
205 -- contentsP = many anyChar
206
207 tokenEnd :: Parser ()
208 tokenEnd = void (char '\n') <|> eof