Skip to content

Commit 5759903

Browse files
This is a component designed to download the Xview dataset
Signed-off-by: weingartlorenz <[email protected]>
1 parent 8a05abd commit 5759903

File tree

4 files changed

+331
-0
lines changed

4 files changed

+331
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
cwlVersion: v1.2
2+
class: CommandLineTool
3+
4+
baseCommand: "claimed"
5+
6+
inputs:
7+
component:
8+
type: string
9+
default: docker.io/mdorzweiler/claimed-input-xview-download:0.1
10+
inputBinding:
11+
position: 1
12+
prefix: --component
13+
log_level:
14+
type: string
15+
default: "INFO"
16+
inputBinding:
17+
position: 2
18+
prefix: --log_level
19+
username:
20+
type: string
21+
default: None
22+
inputBinding:
23+
position: 3
24+
prefix: --username
25+
password:
26+
type: string
27+
default: None
28+
inputBinding:
29+
position: 4
30+
prefix: --password
31+
move_to_dir:
32+
type: string
33+
default: None
34+
inputBinding:
35+
position: 5
36+
prefix: --move_to_dir
37+
chromedriver_path:
38+
type: string
39+
default: None
40+
inputBinding:
41+
position: 6
42+
prefix: --chromedriver_path
43+
max_download_time:
44+
type: string
45+
default: None
46+
inputBinding:
47+
position: 7
48+
prefix: --max_download_time
49+
label:
50+
type: string
51+
default: None
52+
inputBinding:
53+
position: 8
54+
prefix: --label
55+
56+
57+
outputs: []
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "147f9480",
6+
"metadata": {},
7+
"source": [
8+
"## Xview Dataset Download \n",
9+
"\n",
10+
"This component downloads the Xview labeled overhead image dataset to a specified location; it requires a local copy of chromedriver. \n"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": null,
16+
"id": "c185c1f0",
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"!pip install selenium"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": null,
26+
"id": "dc0554b5",
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"\n",
31+
"import os\n",
32+
"import shutil\n",
33+
"import time\n",
34+
"from selenium import webdriver\n",
35+
"from selenium.webdriver.common.by import By\n",
36+
"from selenium.webdriver.support.ui import WebDriverWait\n",
37+
"from selenium.webdriver.support import expected_conditions as EC\n",
38+
"from urllib.parse import urlparse\n"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": null,
44+
"id": "866d16c3",
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"\n",
49+
"# username for the Xview webpage to authorize login\n",
50+
"username = os.environ.get('username')\n",
51+
"\n",
52+
"# password for the Xview webpage to authorize login\n",
53+
"password = os.environ.get('password')\n",
54+
"\n",
55+
"# move_to_dir the directory where the dataset should be saved\n",
56+
"move_to_dir = os.environ.get('move_to_dir')\n",
57+
"\n",
58+
"# chromedriver_path the directory where the local copy of chromedriver is saved\n",
59+
"chromedriver_path = os.environ.get('chromedriver_path')\n",
60+
"\n",
61+
"# max_download_time before timeout, must be adjusted according to the file size and internet speed\n",
62+
"max_download_time = os.environ.get('max_download_time')\n",
63+
"\n",
64+
"# The label of the file desired to download.\n",
65+
"# Choose from \"TI.zip\", \"TL.zip\", \"VI.zip\", \"TI.tgz\", \"TL.tgz\", \"VI.tgz\", \n",
66+
"# standing for TI=Training Images, TL=Training Labels, VI=Validation Images\n",
67+
"label = os.environ.get('label')\n"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": null,
73+
"id": "794506c5",
74+
"metadata": {},
75+
"outputs": [],
76+
"source": [
77+
"\n",
78+
"def login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label): \n",
79+
" \n",
80+
" # Set Chrome options to automatically download files to the specified directory\n",
81+
" options = webdriver.ChromeOptions()\n",
82+
" prefs = {\n",
83+
" \"download.default_directory\": move_to_dir,\n",
84+
" \"download.prompt_for_download\": False,\n",
85+
" \"download.directory_upgrade\": True,\n",
86+
" \"safebrowsing.enabled\": True\n",
87+
" }\n",
88+
" options.add_experimental_option(\"prefs\", prefs)\n",
89+
"\n",
90+
" # Start a new instance of Chrome web browser\n",
91+
" driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)\n",
92+
" \n",
93+
" # Open the login page\n",
94+
" url_login = r'https://challenge.xviewdataset.org/login'\n",
95+
" driver.get(url_login)\n",
96+
"\n",
97+
" # Find the username and password fields and enter credentials\n",
98+
" username_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'email')))\n",
99+
" password_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'password')))\n",
100+
" username_field.send_keys(username)\n",
101+
" password_field.send_keys(password)\n",
102+
"\n",
103+
" # Find and click the login button\n",
104+
" login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn.primary')))\n",
105+
" login_button.click()\n",
106+
" \n",
107+
" # Wait for the page to load after login\n",
108+
" time.sleep(1)\n",
109+
" \n",
110+
" # Open the Download page\n",
111+
" url_download = r'https://challenge.xviewdataset.org/download-links'\n",
112+
" driver.get(url_download)\n",
113+
" \n",
114+
" # Wait for the overlay element to be present\n",
115+
" overlay_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'overlay--active')))\n",
116+
"\n",
117+
" # Remove the automatic pop-up overlay \n",
118+
" body_element = driver.find_element_by_tag_name('body')\n",
119+
" body_element.click()\n",
120+
" time.sleep(1)\n",
121+
" \n",
122+
" # Switch between the possible download files\n",
123+
" search_text = \"\"\n",
124+
" match label:\n",
125+
" case \"TI.zip\":\n",
126+
" search_text = '//a[contains(text(), \"Download Training Images (zip)\")]'\n",
127+
" case \"TL.zip\":\n",
128+
" search_text = '//a[contains(text(), \"Download Training Labels (zip)\")]'\n",
129+
" case \"VI.zip\":\n",
130+
" search_text = '//a[contains(text(), \"Download Validation Images (zip)\")]'\n",
131+
" case \"TI.tgz\":\n",
132+
" search_text = '//a[contains(text(), \"Download Training Images (tgz)\")]'\n",
133+
" case \"TL.tgz\":\n",
134+
" search_text = '//a[contains(text(), \"Download Training Labels (tgz)\")]'\n",
135+
" case \"VI.tgz\":\n",
136+
" search_text = '//a[contains(text(), \"Download Validation Images (tgz)\")]'\n",
137+
" case _:\n",
138+
" raise ValueError(\"Error: This is an invalid download option\") \n",
139+
" \n",
140+
" # Wait for the download link to be present\n",
141+
" download_link_element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, search_text)))\n",
142+
" \n",
143+
" # Get the dynamic download link from the href attribute\n",
144+
" download_link = download_link_element.get_attribute('href')\n",
145+
" \n",
146+
" # Download the dataset using the obtained link\n",
147+
" if download_link:\n",
148+
" driver.get(download_link)\n",
149+
" print(\"Dataset download started successfully.\")\n",
150+
" \n",
151+
" # Extract the filename from the download link URL\n",
152+
" parsed_url = urlparse(download_link)\n",
153+
" filename = parsed_url.path.split('/')[-1]\n",
154+
" downloaded_file = os.path.join(move_to_dir, filename)\n",
155+
" print(downloaded_file)\n",
156+
" \n",
157+
" # Check if the download directory exists\n",
158+
" if not os.path.exists(move_to_dir):\n",
159+
" os.makedirs(move_to_dir)\n",
160+
" \n",
161+
" # Wait for the file to be completely downloaded\n",
162+
" start_time = time.time()\n",
163+
" \n",
164+
" while True:\n",
165+
" if os.path.exists(downloaded_file) and os.path.getsize(downloaded_file) > 0:\n",
166+
" print(\"File downloaded successfully.\")\n",
167+
" break\n",
168+
" elif time.time() - start_time > max_download_time:\n",
169+
" print(\"Error: Maximum wait time exceeded.\")\n",
170+
" break\n",
171+
" else:\n",
172+
" time.sleep(5)\n",
173+
" \n",
174+
" else:\n",
175+
" print(\"Failed to get the download link.\")\n",
176+
"\n",
177+
" # Close the browser\n",
178+
" driver.quit()\n"
179+
]
180+
},
181+
{
182+
"cell_type": "code",
183+
"execution_count": null,
184+
"id": "e7b2f96d",
185+
"metadata": {},
186+
"outputs": [],
187+
"source": [
188+
"login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label)"
189+
]
190+
}
191+
],
192+
"metadata": {
193+
"kernelspec": {
194+
"display_name": "Python 3 (ipykernel)",
195+
"language": "python",
196+
"name": "python3"
197+
},
198+
"language_info": {
199+
"codemirror_mode": {
200+
"name": "ipython",
201+
"version": 3
202+
},
203+
"file_extension": ".py",
204+
"mimetype": "text/x-python",
205+
"name": "python",
206+
"nbconvert_exporter": "python",
207+
"pygments_lexer": "ipython3",
208+
"version": "3.11.5"
209+
}
210+
},
211+
"nbformat": 4,
212+
"nbformat_minor": 5
213+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
apiVersion: batch/v1
2+
kind: Job
3+
metadata:
4+
name: input-xview-download
5+
spec:
6+
template:
7+
spec:
8+
containers:
9+
- name: input-xview-download
10+
image: docker.io/mdorzweiler/claimed-input-xview-download:0.1
11+
workingDir: /opt/app-root/src/
12+
command: ["/opt/app-root/bin/ipython","claimed_input-Xview-download.ipynb"]
13+
env:
14+
- name: log_level
15+
value: value_of_log_level
16+
- name: username
17+
value: value_of_username
18+
- name: password
19+
value: value_of_password
20+
- name: move_to_dir
21+
value: value_of_move_to_dir
22+
- name: chromedriver_path
23+
value: value_of_chromedriver_path
24+
- name: max_download_time
25+
value: value_of_max_download_time
26+
- name: label
27+
value: value_of_label
28+
restartPolicy: OnFailure
29+
imagePullSecrets:
30+
- name: image_pull_secret
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
name: input-xview-download
2+
description: "## Xview Dataset Download – CLAIMED V0.1"
3+
4+
inputs:
5+
- {name: log_level, type: String, description: "update log level", default: "INFO"}
6+
- {name: username, type: String, description: "username for the Xview webpage to authorize login"}
7+
- {name: password, type: String, description: "password for the Xview webpage to authorize login"}
8+
- {name: move_to_dir, type: String, description: "move_to_dir the directory where the dataset should be saved"}
9+
- {name: chromedriver_path, type: String, description: "chromedriver_path the directory where the local copy of chromedriver is saved"}
10+
- {name: max_download_time, type: String, description: "max_download_time before timeout, must be adjusted according to the file size and internet speed"}
11+
- {name: label, type: String, description: "standing for TI=Training Images, TL=Training Labels, VI=Validation Images"}
12+
13+
14+
outputs:
15+
16+
17+
implementation:
18+
container:
19+
image: docker.io/mdorzweiler/claimed-input-xview-download:0.1
20+
command:
21+
- sh
22+
- -ec
23+
- |
24+
ipython ./claimed_input-Xview-download.ipynb log_level="${0}" username="${1}" password="${2}" move_to_dir="${3}" chromedriver_path="${4}" max_download_time="${5}" label="${6}"
25+
- {inputValue: log_level}
26+
- {inputValue: username}
27+
- {inputValue: password}
28+
- {inputValue: move_to_dir}
29+
- {inputValue: chromedriver_path}
30+
- {inputValue: max_download_time}
31+
- {inputValue: label}

0 commit comments

Comments
 (0)