From f5150f51cec5e97e728080f938fef6e06dbb6272 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moussouni=2C=20Ya=C3=ABl?=
Date: Mon, 22 Sep 2025 12:41:47 +0200
Subject: [PATCH] 2025-09-22: Added saves of retrieved ids

---
 db/saved.txt  |  4 +++
 src/arxtic.py | 78 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 62 insertions(+), 20 deletions(-)
 create mode 100644 db/saved.txt

diff --git a/db/saved.txt b/db/saved.txt
new file mode 100644
index 0000000..91aa528
--- /dev/null
+++ b/db/saved.txt
@@ -0,0 +1,4 @@
+2509.15307v1
+2509.15337v1
+2509.15720v1
+2509.16168v1
diff --git a/src/arxtic.py b/src/arxtic.py
index 4fa0de0..7ead428 100644
--- a/src/arxtic.py
+++ b/src/arxtic.py
@@ -58,7 +58,7 @@ def get_entries(rss):
     entries = rss["entries"]
     return entries
 
-def print_entries(entries, fields=None):
+def print_entries(entries, fields=None, keywords=None):
     for i in range(len(entries)):
         entry = entries[i]
 
@@ -81,6 +81,11 @@ def print_entries(entries, fields=None):
                   + "Filtered field(s): "
                   + ", ".join(fields[i])
                   + COLOUR_DEFAULT)
+        if keywords is not None:
+            print(COLOUR_ERROR
+                  + "Filtered keyword(s): "
+                  + ", ".join(keywords[i])
+                  + COLOUR_DEFAULT)
         print("")
     return 0
 
@@ -89,24 +94,24 @@ def get_filters():
     filters_list = [f for f in os.listdir(FILTERS_DIR) if not f[0] == "."]
     for i in range(len(filters_list)):
        path = FILTERS_DIR + filters_list[i]
-        filter_file = open(path)
-        dic = {"fields": [], "values": []}
-        for line in filter_file.readlines():
-            if "#FIELD" in line:
-                field = line.split("=")[1].replace("\"", "").strip()
-                dic["fields"].append(field)
-            elif line[0] == "#" or line in [" \n", "\n", ""]:
-                continue
-            else:
-                value = line.replace("\n", "")
-                dic["values"].append(value)
-        filters.append(dic)
-        filter_file.close()
+        with open(path) as filter_file:
+            dic = {"fields": [], "values": []}
+            for line in filter_file.readlines():
+                if "#FIELD" in line:
+                    field = line.split("=")[1].replace("\"", "").strip()
+                    dic["fields"].append(field)
+                elif line[0] == "#" or line in [" \n", "\n", ""]:
+                    continue
+                else:
+                    value = line.replace("\n", "")
+                    dic["values"].append(value)
+            filters.append(dic)
     return filters
 
 def filter_entries(filters, entries):
     filtered_entries = []
     filtered_fields = []
+    filtered_keywords = []
     for entry in entries:
         added = False
         for filter_ in filters:
@@ -114,21 +119,52 @@ fields = filter_["fields"]
             values = filter_["values"]
             for field in fields:
                 for value in values:
-                    if not added and value in entry[field]:
+                    if not added and value.upper() in str(entry[field]).upper():
                         filtered_entries.append(entry)
                         filtered_fields.append([field])
+                        filtered_keywords.append([value])
                         added = True
-                    elif added and value in entry[field]:
+                    elif added and value.upper() in str(entry[field]).upper():
                         if not field in filtered_fields[-1]:
                             filtered_fields[-1].append(field)
-    return filtered_entries, filtered_fields
+                        if not value in filtered_keywords[-1]:
+                            filtered_keywords[-1].append(value)
+    return filtered_entries, filtered_fields, filtered_keywords
+
+def get_ids(entries):
+    ids = []
+    for entry in entries:
+        ids.append(entry["id"])
+    return ids
+
+def save_ids(ids, library="saved"):
+    if isinstance(ids, list) or isinstance(ids, np.ndarray):
+        ids = [i.replace("oai:", "").replace("arXiv.org:", "") for i in ids]
+    elif isinstance(ids, str):
+        ids = [ids.replace("oai:", "").replace("arXiv.org:", "")]
+    else:
+        raise Exception("The type of ids ({}) is not recognized".format(type(ids)))
+    with open(DB_DIR + library + ".txt", "a+") as db_file:
+        None # creates the file if not already in the directory
+    with open(DB_DIR + library + ".txt", "r+") as db_file:
+        known_ids = [line.replace("\n", "") for line in db_file.readlines()]
+
+    print(known_ids)
+    with open(DB_DIR + library + ".txt", "a+") as db_file:
+        for i in ids:
+            if not i in known_ids:
+                db_file.write(i)
+                db_file.write("\n")
+    return 0
 
 def today_arxiv():
     filters = get_filters()
     feed = get_rss()
     entries = get_entries(feed)
-    entries, fields = filter_entries(filters, entries)
-    print_entries(entries, fields)
+    entries, fields, keywords = filter_entries(filters, entries)
+    ids = get_ids(entries)
+    save_ids(ids)
+    print_entries(entries, fields, keywords)
     return entries, fields
 
 def get_api_ids(ids):
@@ -142,11 +178,13 @@
     feed = fp.parse(query)
     return feed
 
+
+
+
 """
 ids = ["oai:arXiv.org:2509.13163"]
 feed = get_api_ids(ids)
 entries = get_entries(feed)
 print_entries(entries)
 """
-
 today_arxiv()