2025-09-22: Added saves of retrieved ids

This commit is contained in:
Moussouni, Yaël
2025-09-22 12:41:47 +02:00
parent 3d5f53c890
commit f5150f51ce
2 changed files with 62 additions and 20 deletions

4
db/saved.txt Normal file
View File

@@ -0,0 +1,4 @@
2509.15307v1
2509.15337v1
2509.15720v1
2509.16168v1

View File

@@ -58,7 +58,7 @@ def get_entries(rss):
entries = rss["entries"]
return entries
def print_entries(entries, fields=None):
def print_entries(entries, fields=None, keywords=None):
for i in range(len(entries)):
entry = entries[i]
@@ -81,6 +81,11 @@ def print_entries(entries, fields=None):
+ "Filtered field(s): "
+ ", ".join(fields[i])
+ COLOUR_DEFAULT)
if keywords is not None:
print(COLOUR_ERROR
+ "Filtered keyword(s): "
+ ", ".join(keywords[i])
+ COLOUR_DEFAULT)
print("")
return 0
@@ -89,24 +94,24 @@ def get_filters():
filters_list = [f for f in os.listdir(FILTERS_DIR) if not f[0] == "."]
for i in range(len(filters_list)):
path = FILTERS_DIR + filters_list[i]
filter_file = open(path)
dic = {"fields": [], "values": []}
for line in filter_file.readlines():
if "#FIELD" in line:
field = line.split("=")[1].replace("\"", "").strip()
dic["fields"].append(field)
elif line[0] == "#" or line in [" \n", "\n", ""]:
continue
else:
value = line.replace("\n", "")
dic["values"].append(value)
filters.append(dic)
filter_file.close()
with open(path) as filter_file:
dic = {"fields": [], "values": []}
for line in filter_file.readlines():
if "#FIELD" in line:
field = line.split("=")[1].replace("\"", "").strip()
dic["fields"].append(field)
elif line[0] == "#" or line in [" \n", "\n", ""]:
continue
else:
value = line.replace("\n", "")
dic["values"].append(value)
filters.append(dic)
return filters
def filter_entries(filters, entries):
filtered_entries = []
filtered_fields = []
filtered_keywords = []
for entry in entries:
added = False
for filter_ in filters:
@@ -114,21 +119,52 @@ def filter_entries(filters, entries):
values = filter_["values"]
for field in fields:
for value in values:
if not added and value in entry[field]:
if not added and value.upper() in str(entry[field]).upper():
filtered_entries.append(entry)
filtered_fields.append([field])
filtered_keywords.append([value])
added = True
elif added and value in entry[field]:
elif added and value.upper() in str(entry[field]).upper():
if not field in filtered_fields[-1]:
filtered_fields[-1].append(field)
return filtered_entries, filtered_fields
if not value in filtered_keywords[-1]:
filtered_keywords[-1].append(value)
return filtered_entries, filtered_fields, filtered_keywords
def get_ids(entries):
    """Collect the ``"id"`` value of every entry.

    Parameters
    ----------
    entries : iterable
        Entries that each support ``entry["id"]`` lookup.

    Returns
    -------
    list
        The ``"id"`` values, in the same order as ``entries``.
    """
    return [entry["id"] for entry in entries]
def save_ids(ids, library="saved"):
    """Append not-yet-recorded ids to the library file ``DB_DIR + library + ".txt"``.

    The substrings ``"oai:"`` and ``"arXiv.org:"`` are removed from every id
    before it is compared with, or written to, the file. The file holds one
    id per line and is created if it does not exist yet.

    Parameters
    ----------
    ids : str, list or numpy.ndarray
        A single id, or a collection of id strings.
    library : str, optional
        Base name (without the ``.txt`` extension) of the library file.

    Returns
    -------
    int
        Always ``0``.

    Raises
    ------
    TypeError
        If ``ids`` is neither a ``str``, a ``list`` nor a ``numpy.ndarray``.
    """
    if isinstance(ids, str):
        ids = [ids]
    elif not (isinstance(ids, list) or isinstance(ids, np.ndarray)):
        raise TypeError("The type of ids ({}) is not recognized".format(type(ids)))
    cleaned_ids = [i.replace("oai:", "").replace("arXiv.org:", "") for i in ids]

    # "a+" creates the file when it is missing and forces every write to the
    # end of the file; rewinding first lets the same handle read what is
    # already stored.
    with open(DB_DIR + library + ".txt", "a+") as db_file:
        db_file.seek(0)
        known_ids = {line.replace("\n", "") for line in db_file.readlines()}
        for new_id in cleaned_ids:
            if new_id not in known_ids:
                db_file.write(new_id + "\n")
                # Remember the id so a duplicate later in the same call is
                # not written a second time.
                known_ids.add(new_id)
    return 0
def today_arxiv():
    """Fetch the feed, keep the entries matching the filters, save and print them.

    The filters come from ``get_filters()`` and the feed from ``get_rss()``.
    The ids of the entries kept by ``filter_entries`` are passed to
    ``save_ids`` (default library), then the entries are printed together
    with the fields and keywords that matched.

    Returns
    -------
    tuple
        ``(entries, fields)`` as returned by ``filter_entries``; the matched
        keywords are printed but not returned.
    """
    filters = get_filters()
    feed = get_rss()
    entries = get_entries(feed)
    # Single filtering pass: filter_entries yields entries, matched fields
    # and matched keywords.
    entries, fields, keywords = filter_entries(filters, entries)
    ids = get_ids(entries)
    save_ids(ids)
    print_entries(entries, fields, keywords)
    return entries, fields
def get_api_ids(ids):
@@ -142,11 +178,13 @@ def get_api_ids(ids):
feed = fp.parse(query)
return feed
"""
ids = ["oai:arXiv.org:2509.13163"]
feed = get_api_ids(ids)
entries = get_entries(feed)
print_entries(entries)
"""
today_arxiv()