mirror of
https://codeberg.org/Yael-II/ArXtic.git
synced 2026-03-14 22:06:27 +01:00
2025-10-09: Manual update
This commit is contained in:
@@ -31,6 +31,7 @@
|
||||
## Directories
|
||||
VENV_DIR="./venv/"
|
||||
DB_DIR="./db/"
|
||||
PDF_DIR="../Bibliography/"
|
||||
FILTERS_DIR="./cfg/filters/"
|
||||
|
||||
## API Keys
|
||||
@@ -41,4 +42,5 @@ ADSABS_API_KEY="$(cat cfg/adsabs.secret)"
|
||||
ARXIV_QUERY_URL="https://export.arxiv.org/api/query?"
|
||||
ARXIV_RSS_URL="https://rss.arxiv.org/atom/astro-ph"
|
||||
|
||||
ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?q="
|
||||
ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?"
|
||||
ADSABS_EXPORT_URL="https://api.adsabs.harvard.edu/v1/export/bibtexabs/"
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
bibtexparser==2.0.0b8
|
||||
certifi==2025.10.5
|
||||
charset-normalizer==3.4.3
|
||||
feedparser==6.0.12
|
||||
idna==3.10
|
||||
numpy==2.3.3
|
||||
pylatexenc==3.0a33
|
||||
requests==2.32.5
|
||||
sgmllib3k==1.0.0
|
||||
urllib3==2.5.0
|
||||
|
||||
3
setup.sh
3
setup.sh
@@ -30,7 +30,8 @@ source activate.sh
|
||||
echo "=== [ Directory setup ] ==="
|
||||
mkdir -p -v cfg/filters
|
||||
mkdir -p -v db
|
||||
touch db/saved.txt
|
||||
touch db/read.bib
|
||||
touch db/unread.bib
|
||||
chmod u+x *.sh
|
||||
echo "=== Done ==="
|
||||
|
||||
|
||||
238
src/arxtic.py
238
src/arxtic.py
@@ -33,12 +33,19 @@ along with this program. If not, see www.gnu.org/licenses/.
|
||||
import os
|
||||
import textwrap as tw
|
||||
import feedparser as fp
|
||||
import bibtexparser as bib
|
||||
import requests as rq
|
||||
import numpy as np
|
||||
from urllib.parse import urlencode, quote_plus
|
||||
|
||||
FILTERS_DIR = os.environ.get("FILTERS_DIR")
|
||||
DB_DIR = os.environ.get("DB_DIR")
|
||||
PDF_DIR = os.environ.get("PDF_DIR")
|
||||
ARXIV_QUERY_URL = os.environ.get("ARXIV_QUERY_URL")
|
||||
ARXIV_RSS_URL = os.environ.get("ARXIV_RSS_URL")
|
||||
ADSABS_QUERY_URL = os.environ.get("ADSABS_QUERY_URL")
|
||||
ADSABS_EXPORT_URL = os.environ.get("ADSABS_EXPORT_URL")
|
||||
ADSABS_API_KEY = os.environ.get("ADSABS_API_KEY")
|
||||
|
||||
COLOUR_DEFAULT="\033[0m"
|
||||
COLOUR_INPUT="\033[36m"
|
||||
@@ -53,9 +60,7 @@ def wrap(txt, length=80):
|
||||
wrapped_txt = '\n'.join(tw.wrap(txt, length, break_long_words=False))
|
||||
return wrapped_txt
|
||||
|
||||
def get_entries(rss):
|
||||
entries = rss["entries"]
|
||||
return entries
|
||||
## Filters
|
||||
|
||||
def get_filters():
|
||||
filters = []
|
||||
@@ -63,11 +68,14 @@ def get_filters():
|
||||
for i in range(len(filters_list)):
|
||||
path = FILTERS_DIR + filters_list[i]
|
||||
with open(path) as filter_file:
|
||||
dic = {"fields": [], "values": []}
|
||||
dic = {"fields": [], "values": [], "score": 1}
|
||||
for line in filter_file.readlines():
|
||||
if "#FIELD" in line:
|
||||
field = line.split("=")[1].replace("\"", "").strip()
|
||||
dic["fields"].append(field)
|
||||
elif "#SCORE" in line:
|
||||
field = line.split("=")[1].strip()
|
||||
dic["score"] = int(field)
|
||||
elif line[0] == "#" or line in [" \n", "\n", ""]:
|
||||
continue
|
||||
else:
|
||||
@@ -76,36 +84,50 @@ def get_filters():
|
||||
filters.append(dic)
|
||||
return filters
|
||||
|
||||
## ArXiV Entries
|
||||
|
||||
def filter_entries(filters, entries):
    """Keep the entries that match at least one filter keyword and score them.

    A keyword matches when it occurs, case-insensitively, in the string form
    of one of the filter's fields of the entry.  A field that is absent from
    an entry is treated as empty text.

    Parameters
    ----------
    filters : list of dict
        Each filter holds "fields" (entry keys to search), "values"
        (keywords) and, optionally, "score" (weight added per hit; 1 when
        the key is absent).
    entries : list of dict
        Entries to test.

    Returns
    -------
    filtered_entries : list of dict
        Entries with at least one hit, in their original order.
    filtered_data : dict
        "fields", "keywords" and "score" lists, index-aligned with
        ``filtered_entries``: the fields and keywords that matched, and the
        sum of the scores of every hit.
    """
    filtered_entries = []
    filtered_fields = []
    filtered_keywords = []
    filtered_score = []
    for entry in entries:
        added = False
        for filter_ in filters:
            score = filter_.get("score", 1)
            for field in filter_["fields"]:
                # Upper-case the field text once per field, not per keyword.
                text = str(entry.get(field, "")).upper()
                for value in filter_["values"]:
                    if value.upper() not in text:
                        continue
                    if not added:
                        # First hit: register the entry.
                        filtered_entries.append(entry)
                        filtered_fields.append([field])
                        filtered_keywords.append([value])
                        filtered_score.append(score)
                        added = True
                    else:
                        # Later hits only accumulate on the last record.
                        filtered_score[-1] += score
                        if field not in filtered_fields[-1]:
                            filtered_fields[-1].append(field)
                        if value not in filtered_keywords[-1]:
                            filtered_keywords[-1].append(value)
    filtered_data = {"fields": filtered_fields,
                     "keywords": filtered_keywords,
                     "score": filtered_score}
    return filtered_entries, filtered_data
|
||||
|
||||
def print_entries(entries, fields=None, keywords=None):
|
||||
## Print entries
|
||||
|
||||
def print_entries(entries, data=None):
|
||||
for i in range(len(entries)):
|
||||
entry = entries[i]
|
||||
|
||||
print(COLOUR_INFO, end="")
|
||||
if "bibcode" in list(entry):
|
||||
print(entry["bibcode"], end="")
|
||||
if "id" in list(entry):
|
||||
print(entry["id"], end="")
|
||||
if "arxiv_announce_type" in list(entry) :
|
||||
print(" (" + entry["arxiv_announce_type"] + ")", end="")
|
||||
@@ -114,25 +136,28 @@ def print_entries(entries, fields=None, keywords=None):
|
||||
|
||||
print(COLOUR_DEFAULT + wrap(entry["title"]) + COLOUR_DEFAULT)
|
||||
print(COLOUR_OUTPUT
|
||||
+ wrap(", ".join([a["name"] for a in entry["authors"]]))
|
||||
+ wrap(", ".join(entry["author"]))
|
||||
+ COLOUR_DEFAULT)
|
||||
print(COLOUR_INPUT
|
||||
+ wrap("\n".join(entry["summary"].split("\n")[1:]))
|
||||
+ wrap(entry["abstract"])
|
||||
+ COLOUR_DEFAULT)
|
||||
if fields is not None:
|
||||
if data is not None:
|
||||
print(COLOUR_ERROR
|
||||
+ "Filtered field(s): "
|
||||
+ ", ".join(fields[i])
|
||||
+ ", ".join(data["fields"][i])
|
||||
+ COLOUR_DEFAULT)
|
||||
if keywords is not None:
|
||||
print(COLOUR_ERROR
|
||||
+ "Filtered keyword(s): "
|
||||
+ ", ".join(keywords[i])
|
||||
+ ", ".join(data["keywords"][i])
|
||||
+ COLOUR_DEFAULT)
|
||||
print(COLOUR_ERROR
|
||||
+ "Filtered score: "
|
||||
+ str(data["score"][i])
|
||||
+ COLOUR_DEFAULT)
|
||||
print("")
|
||||
return 0
|
||||
|
||||
# ArXiV IDs
|
||||
# IDs
|
||||
|
||||
def get_arxiv_ids(entries):
|
||||
ids = []
|
||||
@@ -169,12 +194,10 @@ def get_arxiv_rss():
|
||||
def today_arxiv():
    """Fetch the arXiv feed, keep the entries matching the filters, print them.

    The feed returned by ``get_arxiv_rss`` is normalised with
    ``get_arxiv_entries``, filtered with ``filter_entries`` using the filters
    from ``get_filters``, and the surviving entries are printed.

    Returns
    -------
    entries : list of dict
        The entries that passed the filters.
    data : dict
        The per-entry match data produced by ``filter_entries``.
    """
    filters = get_filters()
    feed = get_arxiv_rss()
    entries = get_arxiv_entries(feed)
    entries, data = filter_entries(filters, entries)
    print_entries(entries, data)
    return entries, data
|
||||
|
||||
def get_arxiv_from_ids(ids):
|
||||
if isinstance(ids, list) or isinstance(ids, np.ndarray):
|
||||
@@ -185,13 +208,174 @@ def get_arxiv_from_ids(ids):
|
||||
raise Exception(
|
||||
"The type of ids ({}) is not recognized".format(type(ids))
|
||||
)
|
||||
query = ARXIV_QUERY_URL + "id_list=" + ",".join(ids)
|
||||
feed = fp.parse(query)
|
||||
query = urlencode({"id_list": ",".join(ids)})
|
||||
url = ARXIV_QUERY_URL + query
|
||||
feed = fp.parse(url)
|
||||
return feed
|
||||
|
||||
## ADS-ABS
|
||||
|
||||
def get_adsabs_from_ids(ids):
|
||||
def ads_search(query, num=5, sort="date"):
    """Send a search request to the ADS query endpoint.

    Parameters
    ----------
    query : str
        Search expression, sent as the ``q`` parameter.
    num : int
        Value of the ``rows`` parameter.
    sort : str
        Value of the ``sort`` parameter.

    Returns
    -------
    The response object returned by ``rq.get``.
    """
    returned_fields = ("bibcode,title,author,abstract,bibstem,doi,"
                       "keyword,citation,pubdate")
    params = {"q": query,
              "fl": returned_fields,
              "rows": num,
              "sort": sort}
    url = ADSABS_QUERY_URL + urlencode(params)
    headers = {'Authorization': "Bearer " + ADSABS_API_KEY}
    return rq.get(url, headers=headers)
|
||||
|
||||
def ads_author(author, num=10, sort="date"):
    """Search ADS for an author, filter the results and print them.

    Parameters
    ----------
    author : str
        Author name, appended to the ``author:`` search prefix.
    num : int
        Number of rows requested from ``ads_search``.
    sort : str
        Sort key passed to ``ads_search``.

    Returns
    -------
    (entries, data) as produced by ``filter_entries``.
    """
    active_filters = get_filters()
    response = ads_search("author:" + author, num=num, sort=sort)
    matched, match_data = filter_entries(active_filters,
                                         get_ads_entries(response))
    print_entries(matched, match_data)
    return matched, match_data
|
||||
|
||||
# Entries
|
||||
|
||||
def get_arxiv_entries(rss):
    """Convert a parsed arXiv feed into a list of plain entry dictionaries.

    Parameters
    ----------
    rss : mapping
        Parsed feed exposing an "entries" list; each entry provides "id",
        "link", "title", "summary", "published" and, optionally, "authors".

    Returns
    -------
    list of dict
        One dictionary per entry with the keys "id", "link", "title",
        "author" (list of names), "abstract" and "pubdate" (first ten
        characters of "published").
    """
    # Prefixes removed from the raw id so that only the bare identifier is
    # left.  The URL forms are assumed to be what the arXiv query API puts in
    # the id field -- TODO confirm against the arXiv API manual.
    id_prefixes = ("oai:", "arXiv.org:",
                   "http://arxiv.org/abs/", "https://arxiv.org/abs/")
    abstract_marker = "Abstract: "

    entries = []
    for entry_old in rss["entries"]:
        entry = {}

        identifier = entry_old["id"]
        for prefix in id_prefixes:
            identifier = identifier.replace(prefix, "")
        entry["id"] = identifier
        entry["link"] = entry_old["link"]
        entry["title"] = entry_old["title"]

        # An author element may hold several comma-separated names.
        names = []
        for element in entry_old.get("authors", []):
            if isinstance(element, dict):
                names += element["name"].split(",")
        entry["author"] = [name.strip() for name in names]

        # Some summaries start with a header line followed by
        # "Abstract: <text>".  Strip that header only when it is actually
        # present, so a summary holding just the abstract is left intact.
        summary = entry_old["summary"]
        _, newline, rest = summary.partition("\n")
        if newline and rest.startswith(abstract_marker):
            entry["abstract"] = rest[len(abstract_marker):]
        else:
            entry["abstract"] = summary

        entry["pubdate"] = entry_old["published"][0:10]
        entries.append(entry)
    return entries
|
||||
|
||||
|
||||
def get_ads_entries(feed):
    """Extract the documents of an ADS search response as entry dictionaries.

    Parameters
    ----------
    feed : response object
        Must provide ``json()`` returning a mapping with the documents under
        ``["response"]["docs"]``.

    Returns
    -------
    list of dict
        The documents, each extended with "link" (ADS abstract page built
        from the bibcode), "title" replaced by its first element and
        "publisher" set to the first "bibstem" element.  A missing or empty
        "title" / "bibstem" yields an empty string.
    """
    # Decode the body once; every json() call parses the payload again.
    docs = feed.json()["response"]["docs"]
    entries = []
    for entry in docs:
        entry["link"] = "https://ui.adsabs.harvard.edu/abs/" + entry["bibcode"]
        title = entry.get("title")
        entry["title"] = title[0] if title else ""
        bibstem = entry.get("bibstem")
        entry["publisher"] = bibstem[0] if bibstem else ""
        entries.append(entry)
    return entries
|
||||
|
||||
# BibTeX
|
||||
|
||||
def arxiv_to_bibtex(entry,
                    arxtic_notes = "",
                    arxtic_category = "",
                    arxtic_keywords = "",
                    arxtic_score = 0,
                    arxtic_filename = ""):
    """Build a ``@misc`` BibTeX record for an arXiv entry and parse it.

    Parameters
    ----------
    entry : dict
        Entry providing "id" (used as citation key and eprint), "title",
        "author" (list of names), "pubdate" (its first four characters
        become the year) and "link".
    arxtic_notes, arxtic_category, arxtic_keywords, arxtic_score,
    arxtic_filename
        Values written to the BibTeX fields of the same names.

    Returns
    -------
    The object returned by ``bib.parse_string`` for the generated record.
    """
    key = entry["id"]
    record_fields = [
        ("title", entry["title"]),
        ("author", " and ".join(entry["author"])),
        ("year", entry["pubdate"][0:4]),
        ("eprint", key),
        ("url", entry["link"]),
        ("arxtic_notes", arxtic_notes),
        ("arxtic_category", arxtic_category),
        ("arxtic_keywords", arxtic_keywords),
        ("arxtic_score", str(arxtic_score)),
        ("arxtic_filename", str(arxtic_filename)),
    ]
    # One "\tname={value},\n" line per field, closed by a single brace.
    lines = [f"\t{name}={{{value}}},\n" for name, value in record_fields]
    bibentry = f"@misc{{{key},\n" + "".join(lines) + "}"
    return bib.parse_string(bibentry)
|
||||
|
||||
def ads_to_bibtex(entry,
                  arxtic_notes = "",
                  arxtic_category = "",
                  arxtic_keywords = "",
                  arxtic_score = 0,
                  arxtic_filename = ""):
    """Download the ADS BibTeX record of an entry and add the arxtic fields.

    Parameters
    ----------
    entry : dict
        Entry providing "bibcode", which is appended to the export URL.
    arxtic_notes, arxtic_category, arxtic_keywords, arxtic_score,
    arxtic_filename
        Values written to the BibTeX fields of the same names.

    Returns
    -------
    The object returned by ``bib.parse_string`` for the extended record.
    """
    url = ADSABS_EXPORT_URL + entry["bibcode"]
    header = "Bearer " + ADSABS_API_KEY
    feed = rq.get(url, headers={'Authorization': header})
    # NOTE(review): the HTTP status is not checked; an error body would be
    # handed to the BibTeX parser as-is.

    # Re-open the record: drop trailing whitespace, the closing brace and any
    # trailing comma, whatever amount of whitespace the service appended
    # (a fixed two-character slice breaks when that amount differs).
    bibentry = feed.text.rstrip()
    if bibentry.endswith("}"):
        bibentry = bibentry[:-1].rstrip()
    bibentry = bibentry.rstrip(",")

    bibentry += (",\n"
                 f"\tarxtic_notes={{{arxtic_notes}}},\n"
                 f"\tarxtic_category={{{arxtic_category}}},\n"
                 f"\tarxtic_keywords={{{arxtic_keywords}}},\n"
                 f"\tarxtic_score={{{str(arxtic_score)}}},\n"
                 f"\tarxtic_filename={{{str(arxtic_filename)}}},\n"
                 "}")
    bibtex = bib.parse_string(bibentry)
    return bibtex
|
||||
|
||||
def _warn_unidentified(pdf_name, reason):
    """Print the standard warning for a PDF that could not be identified."""
    print(COLOUR_WARNING
          + f"Warning: {pdf_name} has not been correctly identified. "
          + f"({reason})"
          + COLOUR_DEFAULT)


def list_pdf():
    """Identify the PDF files of ``PDF_DIR`` from their file names.

    File names are split on "_" after removing ".pdf".  Two layouts are
    recognised:

    * second field equal to "arxiv" (any case): the remaining fields, joined
      with "/", are looked up as an arXiv identifier;
    * exactly five fields: first author, year, bibstem, volume and page,
      looked up through an ADS search ("AA" is rewritten to "A&A").

    A BibTeX record is built only when the lookup returns exactly one entry;
    every other file triggers a printed warning.

    Returns
    -------
    None
        NOTE(review): the BibTeX records are collected in a local list but
        not returned or stored -- confirm the intended use.
    """
    bibtex_list = []
    pdf_names = [f for f in os.listdir(PDF_DIR)
                 if not f[0] == "." and ".pdf" in f]
    for pdf_name in pdf_names:
        fields = pdf_name.replace(".pdf", "").split("_")
        if len(fields) < 2:
            _warn_unidentified(pdf_name, "unrecognized format #1")
        elif fields[1].upper() == "ARXIV":
            arxiv_id = "/".join(fields[2:])
            feed = get_arxiv_from_ids(arxiv_id)
            entries = get_arxiv_entries(feed)
            if len(entries) == 1:
                bibtex = arxiv_to_bibtex(entries[0],
                                         arxtic_score=99,
                                         arxtic_filename=pdf_name)
                bibtex_list.append(bibtex)
            else:
                _warn_unidentified(pdf_name, "ambiguous #1")
        elif len(fields) == 5:
            first_author, year, bibstem, volume, page = fields
            if bibstem == "AA":
                bibstem = "A&A"
            # Separate the search terms with spaces so that each
            # field:value pair is a distinct term of the query.
            query = " ".join([f"first_author:\"{first_author}\"",
                              f"year:({year})",
                              f"bibstem:\"{bibstem}\"",
                              f"volume:\"{volume}\"",
                              f"page:\"{page}\""])
            # Ask for two rows: a second hit means the match is ambiguous.
            feed = ads_search(query, num=2)
            entries = get_ads_entries(feed)
            if len(entries) == 1:
                bibtex = ads_to_bibtex(entries[0],
                                       arxtic_score=99,
                                       arxtic_filename=pdf_name)
                bibtex_list.append(bibtex)
            else:
                _warn_unidentified(pdf_name, "ambiguous #2")
        else:
            _warn_unidentified(pdf_name, "unrecognized format #2")
    return None
|
||||
|
||||
entries, fields, keywords = today_arxiv()
|
||||
|
||||
list_pdf()
|
||||
|
||||
#entries, data = today_arxiv()
|
||||
|
||||
Reference in New Issue
Block a user