Mercurial > repos > iuc > openalex_explorer
comparison openalex_fetch.py @ 0:7a27a48d57c0 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/openalex commit 7bac5b8acf6091006591be468a252e57793db4d8
| author | iuc |
|---|---|
| date | Sat, 31 May 2025 12:25:39 +0000 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7a27a48d57c0 |
|---|---|
import argparse
import os
import sys
from urllib.parse import quote

import requests
| 6 | |
def get_openalex_id_from_doi(doi):
    """Resolve a DOI to its bare OpenAlex work ID (e.g. 'W2088676066').

    Raises requests.HTTPError if OpenAlex does not know the DOI.
    """
    # quote() the DOI (slashes kept) so characters like '#' or spaces
    # cannot corrupt the request URL.
    url = f'https://api.openalex.org/works/https://doi.org/{quote(doi)}'
    # timeout keeps the tool from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # 'id' is a full URL like https://openalex.org/W2088676066 -> keep last segment
    return response.json()['id'].split('/')[-1]
| 13 | |
| 14 | |
def get_openalex_id_from_title(title):
    """Search OpenAlex by title and return the best match's work ID.

    Raises ValueError when the search yields no results and
    requests.HTTPError on an API error.
    """
    # quote() the title so spaces, '&', '#', '?' etc. cannot break the query string
    url = f'https://api.openalex.org/works?search={quote(title)}'
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    results = response.json().get('results', [])
    if not results:
        raise ValueError("No paper found with the given title.")
    # take the top-ranked search hit
    return results[0]['id'].split('/')[-1]
| 24 | |
| 25 | |
# fetch papers
def fetch_citing_papers(openalex_id, max_citations=None):
    """Return the list of OpenAlex work records that cite *openalex_id*.

    max_citations: stop after this many results (None = fetch all pages).
    Raises ValueError if the work exposes no cited_by URL and
    requests.HTTPError on an API error.
    """
    work_url = f'https://api.openalex.org/works/{openalex_id}'
    response = requests.get(work_url, timeout=30)
    response.raise_for_status()
    work_data = response.json()

    cited_by_url = work_data.get('cited_by_api_url')
    if not cited_by_url:
        raise ValueError("This work has no citing papers.")

    all_citing_papers = []
    per_page = 200  # OpenAlex maximum page size
    page = 1

    while True:
        # BUGFIX: the OpenAlex pagination parameter is spelled 'per-page'.
        # The previous 'per_page' was silently ignored, so the API returned
        # its default 25 results and 'len(results) < per_page' terminated
        # the loop after a single page (max 25 papers, whatever the cap).
        paged_url = f"{cited_by_url}&per-page={per_page}&page={page}"
        response = requests.get(paged_url, timeout=30)
        response.raise_for_status()
        data = response.json()

        results = data.get('results', [])
        if not results:
            break

        all_citing_papers.extend(results)

        # explicit None check so a (pathological) cap of 0 is still honored
        if max_citations is not None and len(all_citing_papers) >= max_citations:
            all_citing_papers = all_citing_papers[:max_citations]
            break

        # a short page means this was the last one
        if len(results) < per_page:
            break

        page += 1

    return all_citing_papers
| 63 | |
| 64 | |
def download_pdf(url, title, folder_name):
    """Best-effort download of *url* into folder_name/<sanitized title>.pdf.

    Never raises: any failure is reported on stdout so one broken link
    does not abort the whole run.
    """
    try:
        # exist_ok avoids the check-then-create race of os.path.exists()
        os.makedirs(folder_name, exist_ok=True)
        response = requests.get(url, timeout=60)
        if response.status_code == 200:
            # keep only filesystem-safe characters from the title
            safe_title = "".join(c for c in title if c.isalnum() or c in " _-").rstrip()
            if not safe_title:
                # title was entirely punctuation/non-ASCII; avoid a bare '.pdf'
                safe_title = "untitled"
            file_path = os.path.join(folder_name, f"{safe_title}.pdf")
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f"[✓] Downloaded: {file_path}")
        else:
            print(f"[x] Failed to download: {url}")
    except Exception as e:
        # deliberate broad catch: downloads are best-effort
        print(f"[!] Error downloading {url}: {e}")
| 80 | |
| 81 | |
def main():
    """CLI entry point: look up a paper, fetch its citing works, print an
    OA/closed summary, and write summary.txt plus citing_papers.tsv."""
    parser = argparse.ArgumentParser(description="Fetch citing papers from OpenAlex")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--id', help='OpenAlex ID of the paper (e.g., W2088676066)')
    group.add_argument('--doi', help='DOI of the paper')
    group.add_argument('--title', help='Title of the paper')

    parser.add_argument('--download', action='store_true', help='Download available OA PDFs')
    parser.add_argument('--max-citations', type=str, default="50", dest='max_citations', help="Max citing papers to fetch or 'all'")
    parser.add_argument('--output-dir', default='.', help='Directory to save output files')
    args = parser.parse_args()

    output_dir = args.output_dir
    summary_path = os.path.join(output_dir, "summary.txt")
    tsv_path = os.path.join(output_dir, "citing_papers.tsv")
    download_dir = os.path.join(output_dir, "downloads")

    # '--max-citations all' disables the cap entirely
    if args.max_citations.lower() == "all":
        max_citations = None
    else:
        max_citations = int(args.max_citations)

    try:
        if args.title:
            openalex_id = get_openalex_id_from_title(args.title)
        elif args.doi:
            openalex_id = get_openalex_id_from_doi(args.doi)
        else:
            openalex_id = args.id

        citing_papers = fetch_citing_papers(openalex_id, max_citations=max_citations)

        # distinct counter names: the old code reused 'is_oa' as a loop
        # variable in the TSV section below, clobbering the OA count
        oa_count = 0
        closed_count = 0

        for paper in citing_papers:
            # papers with no location data cannot be classified; skip them
            if not paper['locations']:
                continue
            # NOTE(review): OA status is taken from the first location only;
            # OpenAlex also exposes 'best_oa_location' — confirm which is intended.
            location = paper['locations'][0]
            is_open = location.get('is_oa', False)
            landing_url = location.get('landing_page_url', 'No URL')

            if is_open:
                oa_count += 1
                print("[OA]", landing_url)
                if args.download:
                    pdf_url = location.get('pdf_url')
                    if pdf_url:
                        download_pdf(pdf_url, paper['title'], download_dir)
                    else:
                        print(f"[!] No direct PDF URL for: {paper['title']}")

            else:
                closed_count += 1
                print("[Closed]", landing_url)

        print("\nSummary:")
        print("Total citing papers:", len(citing_papers))
        print("Open Access papers:", oa_count)
        print("Closed Access papers:", closed_count)

        # save summary
        with open(summary_path, "w") as f:
            f.write(f"Total citing papers: {len(citing_papers)}\n")
            f.write(f"Open Access papers: {oa_count}\n")
            f.write(f"Closed Access papers: {closed_count}\n")

        # save citing papers to a TSV file
        with open(tsv_path, "w", encoding="utf-8") as f:
            f.write("Title\tDOI\tIs_OA\n")
            for paper in citing_papers:
                raw_title = paper.get("title") or "N/A"
                title = raw_title.replace("\t", " ")
                # 'doi' can be present but null in OpenAlex records;
                # 'or' covers both missing and None (old code wrote "None")
                doi = paper.get("doi") or "N/A"
                location = paper['locations'][0] if paper['locations'] else {}
                paper_is_oa = location.get("is_oa", False)

                f.write(f"{title}\t{doi}\t{paper_is_oa}\n")

    except Exception as e:
        # report and exit non-zero so callers (e.g. Galaxy) see the failure;
        # previously the tool printed the error but exited 0
        print(f"[!] Error: {e}")
        sys.exit(1)
| 165 | |
| 166 | |
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
