Skip to content

Commit c52b977

Browse files
authored
Merge pull request #7 from keboola/devin/1777018079-improve-error-logging
fix: OAuth token refresh, infinite recursion, and pre-signed URL auth in OneDrive client
2 parents d716fd6 + efc2aaf commit c52b977

3 files changed

Lines changed: 83 additions & 38 deletions

File tree

.github/workflows/push.yml

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
name: Keboola Component Build & Deploy Pipeline
22
on:
33
push:
4-
branches:
5-
- 'feature/*'
6-
- 'bug/*'
4+
branches-ignore:
5+
- main
76
tags:
8-
- '*' # Skip the workflow on the main branch without tags
7+
- "*" # Skip the workflow on the main branch without tags
8+
workflow_dispatch:
99

1010
concurrency: ci-${{ github.ref }} # to avoid tag collisions in the ECR
1111
env:
@@ -47,7 +47,7 @@ jobs:
4747
echo "branch_name=$branch_name" | tee -a $GITHUB_OUTPUT
4848
else
4949
raw=$(git branch -r --contains ${{ github.ref }})
50-
branch="$(echo $raw | sed "s/.*origin\///" | tr -d '\n')"
50+
branch="$(echo ${raw/*origin\//} | tr -d '\n')"
5151
echo "branch_name=$branch" | tee -a $GITHUB_OUTPUT
5252
fi
5353
@@ -64,8 +64,13 @@ jobs:
6464
- name: Set image tag
6565
id: tag
6666
run: |
67-
TAG="${GITHUB_REF##*/}"
68-
IS_SEMANTIC_TAG=$(echo "$TAG" | grep -q '^v\?[0-9]\+\.[0-9]\+\.[0-9]\+$' && echo true || echo false)
67+
REF="${GITHUB_REF##*/}"
68+
if [ "${{ github.ref_type }}" = "tag" ]; then
69+
TAG="$REF"
70+
else
71+
TAG="$REF-${{ github.run_number }}"
72+
fi
73+
IS_SEMANTIC_TAG=$(echo "$REF" | grep -q '^[0-9]\+\.[0-9]\+\.[0-9]\+$' && echo true || echo false)
6974
echo "is_semantic_tag=$IS_SEMANTIC_TAG" | tee -a $GITHUB_OUTPUT
7075
echo "app_image_tag=$TAG" | tee -a $GITHUB_OUTPUT
7176

src/client/client.py

Lines changed: 65 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ class OneDriveClientException(Exception):
1616
pass
1717

1818

19+
class OneDriveTransientException(OneDriveClientException):
20+
"""Raised for transient HTTP errors (429, 5xx) that are safe to retry."""
21+
pass
22+
23+
1924
class OneDriveClient(HttpClient):
2025
"""
2126
The OneDriveClient class manages the interaction with OneDrive API. It handles tasks such as
@@ -74,15 +79,15 @@ def _configure_client(self):
7479
def _configure_onedrive_client(self):
7580
logging.info("Initializing OneDrive client")
7681
self.client_type = "OneDrive"
77-
self.authority = 'https://login.microsoftonline.com/common'
82+
self.auth_url = 'https://login.microsoftonline.com/common'
7883
self.base_url = 'https://graph.microsoft.com/v1.0/me'
7984
self.scope = 'User.Read Files.Read.All offline_access'
8085
self._get_request_tokens()
8186

8287
def _configure_sharepoint_client(self):
8388
logging.info("Initializing Sharepoint client")
8489
self.client_type = "Sharepoint"
85-
self.authority = f'https://login.microsoftonline.com/{self.tenant_id}'
90+
self.auth_url = f'https://login.microsoftonline.com/{self.tenant_id}'
8691
self.scope = 'Sites.Read.All Files.Read.All offline_access'
8792
self.base_url = 'https://graph.microsoft.com/v1.0/sites/'
8893
# We need access token to get site id and url
@@ -93,7 +98,7 @@ def _configure_sharepoint_client(self):
9398
def _configure_onedrive_for_business_client(self):
9499
logging.info("Initializing OneDriveForBusiness client")
95100
self.client_type = "OneDriveForBusiness"
96-
self.authority = f'https://login.microsoftonline.com/{self.tenant_id}'
101+
self.auth_url = f'https://login.microsoftonline.com/{self.tenant_id}'
97102
self.base_url = 'https://graph.microsoft.com/v1.0/me/drive'
98103
self.scope = 'Sites.Read.All Files.Read.All offline_access'
99104
self._get_request_tokens()
@@ -102,8 +107,8 @@ def _get_request_tokens(self) -> None:
102107
"""
103108
This is handled using requests to handle compatibility with OneDrive and Sharepoint client.
104109
"""
105-
logging.info("Fetching New Access token")
106-
request_url = "https://login.microsoftonline.com/common/oauth2/v2.0/token"
110+
logging.debug(f"Fetching new access token from {self.auth_url}")
111+
request_url = f"{self.auth_url}/oauth2/v2.0/token"
107112
headers = {"Content-Type": "application/x-www-form-urlencoded"}
108113
payload = {
109114
"client_id": self.client_id,
@@ -117,11 +122,16 @@ def _get_request_tokens(self) -> None:
117122

118123
token = response.json().get("access_token", None)
119124
if not token:
120-
logging.error(response.json())
121-
raise OneDriveClientException("Authentication failed, "
122-
"reauthorize the extractor in extractor configuration.")
123-
124-
logging.info("New Access token fetched.")
125+
error_response = response.json()
126+
error_code = error_response.get("error", "unknown")
127+
error_description = error_response.get("error_description", "No error description provided")
128+
logging.error(f"Token refresh failed (HTTP {response.status_code}): {error_code} - {error_description}")
129+
raise OneDriveClientException(
130+
f"Authentication failed (HTTP {response.status_code}): {error_code} - {error_description}. "
131+
f"Reauthorize the extractor in extractor configuration."
132+
)
133+
134+
logging.debug("New access token fetched.")
125135
self.access_token = token
126136
self._refresh_token = response.json()["refresh_token"]
127137

@@ -133,19 +143,35 @@ def refresh_token(self):
133143
return self._refresh_token
134144

135145
def get_request(self, url: str, is_absolute_path: bool, stream: bool = False):
136-
response = self.get_raw(url, is_absolute_path=is_absolute_path, stream=stream)
137-
if response.status_code == 200:
138-
return response
139-
elif response.status_code == 401:
140-
self._get_request_tokens()
141-
return self.get_request(url, is_absolute_path, stream)
142-
elif response.status_code == 404:
143-
logging.error(f"Url {url} returned 404.")
144-
return None
145-
else:
146-
raise OneDriveClientException(f"Cannot fetch {url}, "
147-
f"response: {response.text}, "
148-
f"status_code: {response.status_code}")
146+
url_path = urlparse(url).path
147+
for attempt in range(2):
148+
response = self.get_raw(url, is_absolute_path=is_absolute_path, stream=stream)
149+
if response.status_code == 200:
150+
return response
151+
elif response.status_code == 404:
152+
logging.error(f"URL {url_path} returned 404.")
153+
return None
154+
elif response.status_code == 401:
155+
if attempt == 0:
156+
logging.warning(f"Got 401 fetching {url_path}, refreshing token and retrying...")
157+
self._get_request_tokens()
158+
continue
159+
error_body = response.json()
160+
error_code = error_body.get("error", "unknown")
161+
error_desc = error_body.get("error_description", "no description")
162+
raise OneDriveClientException(
163+
f"Authentication failed after token refresh (HTTP 401): "
164+
f"{error_code} - {error_desc}"
165+
)
166+
elif response.status_code in (429, 500, 502, 503, 504):
167+
raise OneDriveTransientException(
168+
f"Transient error fetching {url_path}: HTTP {response.status_code} - {response.text}"
169+
)
170+
else:
171+
raise OneDriveClientException(
172+
f"Cannot fetch {url_path}, response: {response.text}, "
173+
f"status_code: {response.status_code}"
174+
)
149175

150176
def _resolve_folder_id(self, drive_type: str, folder_path: str):
151177
if folder_path is None or folder_path == '/':
@@ -273,14 +299,16 @@ def get_site_id_from_url(self, site_url: str):
273299
url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:{server_relative_path}"
274300
headers = {"Authorization": 'Bearer ' + self.access_token}
275301

302+
logging.info(f"Resolving site URL '{site_url}' via Graph API")
276303
response = requests.get(url, headers=headers)
277304

278305
if response.status_code == 200:
279306
site = response.json()
280307
site_id = site['id']
308+
logging.info(f"Resolved site ID: {site_id}")
281309
return site_id
282310
else:
283-
raise OneDriveClientException(f"Error occurred when fetching site information: "
311+
raise OneDriveClientException(f"Error occurred when fetching site information for '{site_url}': "
284312
f"{response.status_code}, {response.text}")
285313

286314
def _get_sharepoint_document_libraries(self):
@@ -295,12 +323,12 @@ def _get_sharepoint_document_libraries(self):
295323
raise OneDriveClientException(f"Error occurred when getting SharePoint document libraries:"
296324
f" {response.status_code}, {response.text}")
297325

298-
@backoff.on_exception(backoff.expo, Exception, max_tries=MAX_RETRIES)
326+
@backoff.on_exception(backoff.expo, OneDriveTransientException, max_tries=MAX_RETRIES)
299327
def _download_file_from_onedrive_url(self, url, output_path, filename):
300328
"""
301329
Downloads a file from OneDrive using the provided download URL and saves it to the specified output path.
302330
"""
303-
with self.get_request(url, is_absolute_path=True, stream=True) as r:
331+
with self.get_raw(url, is_absolute_path=True, stream=True, ignore_auth=True) as r:
304332

305333
if r is None:
306334
self._handle_no_response(filename)
@@ -400,7 +428,10 @@ def get_document_libraries(self, site_url):
400428
try:
401429
response.raise_for_status()
402430
except HTTPError as e:
403-
raise OneDriveClientException(f"Cannot get document libraries for site_url: {site_url}") from e
431+
raise OneDriveClientException(
432+
f"Cannot get document libraries for site URL '{site_url}': "
433+
f"{response.status_code}, {response.text}"
434+
) from e
404435

405436
return response.json()['value']
406437

@@ -410,6 +441,13 @@ def download_files(self, file_path, output_dir, last_modified_at=None, library_n
410441
folder_path, mask = self._split_path_mask(file_path)
411442
logging.info(f"Downloading files matching mask {mask} from folder {folder_path}")
412443
items = self._get_items_based_on_client_type(folder_path, library_name)
444+
files = [i for i in items if i.get("file")]
445+
folders = [i for i in items if i.get("folder")]
446+
unknown = len(items) - len(files) - len(folders)
447+
breakdown = f"{len(files)} files, {len(folders)} subfolders"
448+
if unknown:
449+
breakdown += f", {unknown} unknown"
450+
logging.info(f"Found {len(items)} items ({breakdown}) in '{folder_path}'")
413451
folder_mask = self._create_folder_mask(mask, folder_path)
414452
self._process_items(items, folder_mask, mask, folder_path, output_dir, last_modified_at, library_name)
415453

src/component.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,17 +91,19 @@ def _init_configuration(self) -> None:
9191
def _get_client(self, account_params: Account) -> OneDriveClient:
9292
tenant_id = account_params.tenant_id
9393
site_url = account_params.site_url
94+
last_error = None
9495
for refresh_token in self._get_refresh_tokens():
9596
try:
9697
client = OneDriveClient(refresh_token=refresh_token, files_out_folder=self.files_out_path,
9798
client_id=self.client_id, client_secret=self.client_secret,
9899
tenant_id=tenant_id, site_url=site_url)
99100
self._save_refresh_token_state(client.refresh_token)
100101
return client
101-
except OneDriveClientException:
102-
logging.warning("Refresh token failed, retrying connection with new refresh token.")
103-
pass
104-
raise UserException('Authentication failed, reauthorize the extractor in extractor configuration!')
102+
except OneDriveClientException as e:
103+
last_error = e
104+
logging.warning(f"Refresh token failed: {e}")
105+
raise UserException(str(last_error)) if last_error else \
106+
UserException("Authentication failed, reauthorize the extractor in extractor configuration!")
105107

106108
def _get_refresh_tokens(self) -> list[str]:
107109
state_file = self.get_state_file()

0 commit comments

Comments
 (0)