]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
[FIX] Advanced Bridgeness test
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / FrameWrite.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.FrameWrite
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 module Gargantext.Core.Text.Corpus.Parsers.FrameWrite
13 where
14
15 import Control.Applicative ((*>))
16 import Control.Monad (void)
17 import Data.Either
18 import Data.Maybe
19 import Data.Text hiding (foldl)
20 import Gargantext.Core.Text (sentences)
21 import Gargantext.Prelude
22 import Prelude ((++), read)
23 import Text.Parsec hiding (Line)
24 import Text.Parsec.String
25 import qualified Data.Text as DT
26 import qualified Data.List as List
27
28
29 -- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
30
31 -- Authors : default : anonymous ; except if the following line is encountered ^@@authors: FirstName1, LastName1 ; FirstName2, LastName2 ; etc.
32 -- date : default : date of last change except if the following line is encountered ^@@date: 2021-09-10
33 -- source: Name of the root node except if the following line is encountered ^@@source:
34 -- By default, 1 framawrite node = 1 document. Option for further developments: allow to give a level at generation for the split within framawrite node : :
35 --
36 -- par défaut: un doc == 1 NodeWrite
37 -- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.
38
39 sample :: Text
40 sample =
41 unlines
42 [ "title1"
43 -- , "title2"
44 -- , "=="
45 -- , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
46 , "date: 2021-09-10"
47 , "source: someSource"
48 , "document contents 1"
49 , "document contents 2"
50 ]
51
52 sampleUnordered :: Text
53 sampleUnordered =
54 unlines
55 [ "title1"
56 , "title2"
57 , "=="
58 , "document contents 1"
59 , "date: 2021-09-10"
60 , "authors: FirstName1, LastName1; FirstName2, LastName2"
61 , "source: someSource"
62 , "document contents 2"
63 ]
64
65 -- parseSample = parse documentP "sample" (unpack sample)
66 -- parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
67 parseLinesSample :: Either ParseError Parsed
68 parseLinesSample = parseLines sample
69 parseLinesSampleUnordered :: Either ParseError Parsed
70 parseLinesSampleUnordered = parseLines sampleUnordered
71
72 data Author =
73 Author { firstName :: Text
74 , lastName :: Text }
75 deriving (Show)
76
77 data Parsed =
78 Parsed { title :: Text
79 , authors :: [Author]
80 , date :: Maybe Date
81 , source :: Maybe Text
82 , contents :: Text }
83 deriving (Show)
84
85 emptyParsed :: Parsed
86 emptyParsed =
87 Parsed { title = ""
88 , authors = []
89 , date = Nothing
90 , source = Nothing
91 , contents = "" }
92
93 data Date =
94 Date { year :: Integer
95 , month :: Integer
96 , day :: Integer }
97 deriving (Show)
98
99 data Line =
100 LAuthors [Author]
101 | LContents Text
102 | LDate Date
103 | LSource Text
104 | LTitle Text
105 deriving (Show)
106
107 parseLines :: Text -> Either ParseError Parsed
108 parseLines text = foldl f emptyParsed <$> lst
109 where
110 lst = parse documentLinesP "" (unpack text)
111 f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. }
112 f (Parsed { .. }) (LContents c) = Parsed { contents = concat [contents, c], .. }
113 f (Parsed { .. }) (LDate d ) = Parsed { date = Just d, .. }
114 f (Parsed { .. }) (LSource s ) = Parsed { source = Just s, .. }
115 f (Parsed { .. }) (LTitle t ) = Parsed { title = t, .. }
116
117 documentLinesP :: Parser [Line]
118 documentLinesP = do
119 t <- titleP
120 ls <- lineP `sepBy` newline
121 pure $ [LTitle $ pack t] ++ ls
122
123 lineP :: Parser Line
124 lineP = do
125 choice [ try authorsLineP
126 , try dateLineP
127 , try sourceLineP
128 , contentsLineP ]
129
130 authorsLineP :: Parser Line
131 authorsLineP = do
132 authors <- authorsP
133 pure $ LAuthors authors
134
135 dateLineP :: Parser Line
136 dateLineP = do
137 date <- dateP
138 pure $ LDate date
139
140 sourceLineP :: Parser Line
141 sourceLineP = do
142 source <- sourceP
143 pure $ LSource $ pack source
144
145 contentsLineP :: Parser Line
146 contentsLineP = do
147 contents <- many (noneOf "\n")
148 pure $ LContents $ pack contents
149
150 --------------------
151
152 -- documentP = do
153 -- t <- titleP
154 -- a <- optionMaybe authorsP
155 -- d <- optionMaybe dateP
156 -- s <- optionMaybe sourceP
157 -- c <- contentsP
158 -- pure $ Parsed { title = pack t
159 -- , authors = fromMaybe [] a
160 -- , date = pack <$> d
161 -- , source = pack <$> s
162 -- , contents = pack c }
163
164 titleDelimiterP :: Parser ()
165 titleDelimiterP = do
166 _ <- newline
167 -- _ <- try (string "==")
168 pure ()
169 titleP :: Parser [Char]
170 titleP = manyTill anyChar (try titleDelimiterP)
171
172 authorsPrefixP :: Parser [Char]
173 authorsPrefixP = do
174 _ <- string "authors:"
175 many (char ' ')
176 authorsP :: Parser [Author]
177 authorsP = try authorsPrefixP *> sepBy authorP (char ';')
178 authorP :: Parser Author
179 authorP = do
180 fn <- manyTill anyChar (char ',')
181 _ <- many (char ' ')
182 --ln <- manyTill anyChar (void (char ';') <|> tokenEnd)
183 --ln <- manyTill anyChar (tokenEnd)
184 ln <- many (noneOf "\n")
185 pure $ Author { firstName = pack fn, lastName = pack ln }
186 -- manyTill anyChar (void (char '\n') <|> eof)
187
188 datePrefixP :: Parser [Char]
189 datePrefixP = do
190 _ <- string "date:"
191 many (char ' ')
192 dateP :: Parser Date
193 dateP = try datePrefixP
194 *> dateISOP
195 -- *> many (noneOf "\n")
196
197 dateISOP :: Parser Date
198 dateISOP = do
199 year <- rd <$> number
200 _ <- char '-'
201 month <- rd <$> number
202 _ <- char '-'
203 day <- rd <$> number
204 _ <- many (noneOf "\n" )
205 pure $ Date { year, month, day }
206 where
207 rd = read :: [Char] -> Integer
208 number = many1 digit
209
210 sourcePrefixP :: Parser [Char]
211 sourcePrefixP = do
212 _ <- string "source:"
213 many (char ' ')
214 sourceP :: Parser [Char]
215 sourceP = try sourcePrefixP
216 *> many (noneOf "\n")
217
218 -- contentsP :: Parser String
219 -- contentsP = many anyChar
220
221 tokenEnd :: Parser ()
222 tokenEnd = void (char '\n') <|> eof
223
224 --- MISC Tools
225
226 text2paragraphs :: Int -> Text -> [Text]
227 text2paragraphs n = List.map DT.concat
228 . splitEvery n . List.map clean
229 . sentences . DT.concat . DT.lines
230
231 clean :: Text -> Text
232 clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
233
234