2025-09-22: RSS feed query and parsing

This commit is contained in:
Moussouni, Yaël
2025-09-22 10:31:25 +02:00
parent e52a0df96b
commit 5277548289
5 changed files with 98 additions and 7 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -1,5 +1,7 @@
#FIELD="author"
Moussouni Moussouni
Voggel Voggel
Lan\ccon
Lançon Lançon
Lancon Lancon
Hilker Hilker

View File

@@ -1,7 +1,18 @@
#FIELD="title"
#FIELD="title_detail"
#FIELD="tags"
#FIELD="summary"
# GC
Globular cluster
Globular clusters Globular clusters
Galaxy: globular cluster
Galaxy: globular clusters Galaxy: globular clusters
Galaxy: globular clusters
Galaxies: clusters: intracluster medium
GC GC
GCs GCs
# ICM
Intracluster medium
Clusters: intracluster medium
Galaxies: clusters: intracluster medium

View File

@@ -30,5 +30,12 @@
## Directories
VENV_DIR="./venv/"
DB_DIR="./db/"
FILTERS_DIR="./cfg/filters/"
## Remote URLs
QUERY_URL="https://export.arxiv.org/api/query?"
RSS_URL="https://rss.arxiv.org/atom/astro-ph"
## FILTERS
FILTERS_LIST="authors.txt,keywords.txt"

View File

@@ -33,8 +33,11 @@ along with this program. If not, see www.gnu.org/licenses/.
import os import os
import feedparser as fp import feedparser as fp
FILTERS_DIR = os.environ.get("FILTERS_DIR")
DB_DIR = os.environ.get("DB_DIR")
QUERY_URL = os.environ.get("QUERY_URL") QUERY_URL = os.environ.get("QUERY_URL")
RSS_URL = os.environ.get("RSS_URL") RSS_URL = os.environ.get("RSS_URL")
FILTERS_LIST = os.environ.get("FILTERS_LIST").split(",")
COLOUR_DEFAULT="\033[0m" COLOUR_DEFAULT="\033[0m"
COLOUR_INPUT="\033[36m" COLOUR_INPUT="\033[36m"
@@ -43,11 +46,79 @@ COLOUR_INFO="\033[34m"
COLOUR_WARNING="\033[93m" COLOUR_WARNING="\033[93m"
COLOUR_ERROR="\033[91m" COLOUR_ERROR="\033[91m"
def get_rss():
    """Download and parse the arXiv feed at RSS_URL; return the parsed result."""
    return fp.parse(RSS_URL)
def get_entries(rss):
    """Return the list of feed entries from a parsed feed object."""
    return rss["entries"]
def print_entries(entries, fields=None):
    """Pretty-print feed entries to the terminal with ANSI colours.

    Parameters
    ----------
    entries : list of feedparser entry dicts; each must carry the keys
        "id", "arxiv_announce_type", "link", "title", "author", "summary".
    fields : optional list parallel to *entries*; for each entry, the filter
        field names that matched it. Printed when given.

    Returns 0.
    """
    # enumerate instead of range(len(...)) — i is still needed to index
    # the parallel *fields* list.
    for i, entry in enumerate(entries):
        print(COLOUR_INFO
              + entry["id"]
              + " ("
              + entry["arxiv_announce_type"]
              + ") ["
              + entry["link"]
              + "]"
              + COLOUR_DEFAULT)
        print(COLOUR_DEFAULT + entry["title"] + COLOUR_DEFAULT)
        print(COLOUR_OUTPUT + entry["author"] + COLOUR_DEFAULT)
        # Drop the first line of the summary before printing — presumably
        # boilerplate (e.g. an arXiv id banner); TODO confirm against the feed.
        print(COLOUR_INPUT + "\n".join(entry["summary"].split("\n")[1:]) + COLOUR_DEFAULT)
        if fields is not None:
            print(COLOUR_ERROR
                  + "Filtered field(s): "
                  + ", ".join(fields[i])
                  + COLOUR_DEFAULT)
        print("")
    return 0
def get_filters():
    """Load every filter file named in FILTERS_LIST from FILTERS_DIR.

    Each filter file contains '#FIELD="name"' directives naming the entry
    fields to search, '#'-comment or blank lines, and plain lines giving
    the values to match.

    Returns
    -------
    list of dicts, one per file: {"fields": [...], "values": [...]}.
    """
    filters = []
    for filename in FILTERS_LIST:
        path = FILTERS_DIR + filename
        dic = {"fields": [], "values": []}
        # 'with' guarantees the file is closed even if parsing raises
        # (the original leaked the handle on error).
        with open(path) as filter_file:
            for line in filter_file:
                if "#FIELD" in line:
                    # '#FIELD="title"' -> 'title'; split only once so an
                    # '=' inside the quoted value cannot break the parse.
                    field = line.split("=", 1)[1].replace("\"", "").strip()
                    dic["fields"].append(field)
                elif line.startswith("#") or not line.strip():
                    # Comment or whitespace-only line (the original only
                    # recognised ' \n', '\n' and '' as blank).
                    continue
                else:
                    dic["values"].append(line.rstrip("\n"))
        filters.append(dic)
    return filters
def filter_entries(filters, entries):
    """Select the entries matching any filter value in any filter field.

    Parameters
    ----------
    filters : list of {"fields": [...], "values": [...]} dicts
        (as produced by get_filters).
    entries : list of feedparser entry dicts.

    Returns
    -------
    (filtered_entries, filtered_fields) : the matching entries and, parallel
    to them, the list of field names that matched each entry.
    """
    filtered_entries = []
    filtered_fields = []
    for entry in entries:
        matched_fields = []
        for filter_ in filters:
            for field in filter_["fields"]:
                # Entries do not necessarily carry every filterable field;
                # the original raised KeyError on a missing one.
                content = entry.get(field)
                if content is None:
                    continue
                for value in filter_["values"]:
                    # Record each matching field once per entry (the
                    # original could report the same field repeatedly).
                    if value in content and field not in matched_fields:
                        matched_fields.append(field)
        if matched_fields:
            filtered_entries.append(entry)
            filtered_fields.append(matched_fields)
    return filtered_entries, filtered_fields
def today_arxiv():
    """Fetch today's arXiv feed, keep only filtered entries, and print them.

    Returns
    -------
    (entries, fields) : the filtered entries and, parallel to them, the
    field names that matched each entry (see filter_entries).
    """
    rss = get_rss()
    entries = get_entries(rss)
    filters = get_filters()
    entries, fields = filter_entries(filters, entries)
    print_entries(entries, fields)
    return entries, fields
## Find using arxiv id