@@ -16,6 +16,11 @@ class OneDriveClientException(Exception):
1616 pass
1717
1818
19+ class OneDriveTransientException (OneDriveClientException ):
20+ """Raised for transient HTTP errors (429, 5xx) that are safe to retry."""
21+ pass
22+
23+
1924class OneDriveClient (HttpClient ):
2025 """
2126 The OneDriveClient class manages the interaction with OneDrive API. It handles tasks such as
@@ -74,15 +79,15 @@ def _configure_client(self):
7479 def _configure_onedrive_client (self ):
7580 logging .info ("Initializing OneDrive client" )
7681 self .client_type = "OneDrive"
77- self .authority = 'https://login.microsoftonline.com/common'
82+ self .auth_url = 'https://login.microsoftonline.com/common'
7883 self .base_url = 'https://graph.microsoft.com/v1.0/me'
7984 self .scope = 'User.Read Files.Read.All offline_access'
8085 self ._get_request_tokens ()
8186
8287 def _configure_sharepoint_client (self ):
8388 logging .info ("Initializing Sharepoint client" )
8489 self .client_type = "Sharepoint"
85- self .authority = f'https://login.microsoftonline.com/{ self .tenant_id } '
90+ self .auth_url = f'https://login.microsoftonline.com/{ self .tenant_id } '
8691 self .scope = 'Sites.Read.All Files.Read.All offline_access'
8792 self .base_url = 'https://graph.microsoft.com/v1.0/sites/'
8893 # We need access token to get site id and url
@@ -93,7 +98,7 @@ def _configure_sharepoint_client(self):
9398 def _configure_onedrive_for_business_client (self ):
9499 logging .info ("Initializing OneDriveForBusiness client" )
95100 self .client_type = "OneDriveForBusiness"
96- self .authority = f'https://login.microsoftonline.com/{ self .tenant_id } '
101+ self .auth_url = f'https://login.microsoftonline.com/{ self .tenant_id } '
97102 self .base_url = 'https://graph.microsoft.com/v1.0/me/drive'
98103 self .scope = 'Sites.Read.All Files.Read.All offline_access'
99104 self ._get_request_tokens ()
@@ -102,8 +107,8 @@ def _get_request_tokens(self) -> None:
102107 """
103108 This is handled using requests to handle compatibility with OneDrive and Sharepoint client.
104109 """
105- logging .info ( "Fetching New Access token" )
106- request_url = "https://login.microsoftonline.com/common /oauth2/v2.0/token"
110+ logging .debug ( f "Fetching new access token from { self . auth_url } " )
111+ request_url = f" { self . auth_url } /oauth2/v2.0/token"
107112 headers = {"Content-Type" : "application/x-www-form-urlencoded" }
108113 payload = {
109114 "client_id" : self .client_id ,
@@ -117,11 +122,16 @@ def _get_request_tokens(self) -> None:
117122
118123 token = response .json ().get ("access_token" , None )
119124 if not token :
120- logging .error (response .json ())
121- raise OneDriveClientException ("Authentication failed, "
122- "reauthorize the extractor in extractor configuration." )
123-
124- logging .info ("New Access token fetched." )
125+ error_response = response .json ()
126+ error_code = error_response .get ("error" , "unknown" )
127+ error_description = error_response .get ("error_description" , "No error description provided" )
128+ logging .error (f"Token refresh failed (HTTP { response .status_code } ): { error_code } - { error_description } " )
129+ raise OneDriveClientException (
130+ f"Authentication failed (HTTP { response .status_code } ): { error_code } - { error_description } . "
131+ f"Reauthorize the extractor in extractor configuration."
132+ )
133+
134+ logging .debug ("New access token fetched." )
125135 self .access_token = token
126136 self ._refresh_token = response .json ()["refresh_token" ]
127137
@@ -133,19 +143,35 @@ def refresh_token(self):
133143 return self ._refresh_token
134144
135145 def get_request (self , url : str , is_absolute_path : bool , stream : bool = False ):
136- response = self .get_raw (url , is_absolute_path = is_absolute_path , stream = stream )
137- if response .status_code == 200 :
138- return response
139- elif response .status_code == 401 :
140- self ._get_request_tokens ()
141- return self .get_request (url , is_absolute_path , stream )
142- elif response .status_code == 404 :
143- logging .error (f"Url { url } returned 404." )
144- return None
145- else :
146- raise OneDriveClientException (f"Cannot fetch { url } , "
147- f"response: { response .text } , "
148- f"status_code: { response .status_code } " )
146+ url_path = urlparse (url ).path
147+ for attempt in range (2 ):
148+ response = self .get_raw (url , is_absolute_path = is_absolute_path , stream = stream )
149+ if response .status_code == 200 :
150+ return response
151+ elif response .status_code == 404 :
152+ logging .error (f"URL { url_path } returned 404." )
153+ return None
154+ elif response .status_code == 401 :
155+ if attempt == 0 :
156+ logging .warning (f"Got 401 fetching { url_path } , refreshing token and retrying..." )
157+ self ._get_request_tokens ()
158+ continue
159+ error_body = response .json ()
160+ error_code = error_body .get ("error" , "unknown" )
161+ error_desc = error_body .get ("error_description" , "no description" )
162+ raise OneDriveClientException (
163+ f"Authentication failed after token refresh (HTTP 401): "
164+ f"{ error_code } - { error_desc } "
165+ )
166+ elif response .status_code in (429 , 500 , 502 , 503 , 504 ):
167+ raise OneDriveTransientException (
168+ f"Transient error fetching { url_path } : HTTP { response .status_code } - { response .text } "
169+ )
170+ else :
171+ raise OneDriveClientException (
172+ f"Cannot fetch { url_path } , response: { response .text } , "
173+ f"status_code: { response .status_code } "
174+ )
149175
150176 def _resolve_folder_id (self , drive_type : str , folder_path : str ):
151177 if folder_path is None or folder_path == '/' :
@@ -273,14 +299,16 @@ def get_site_id_from_url(self, site_url: str):
273299 url = f"https://graph.microsoft.com/v1.0/sites/{ hostname } :{ server_relative_path } "
274300 headers = {"Authorization" : 'Bearer ' + self .access_token }
275301
302+ logging .info (f"Resolving site URL '{ site_url } ' via Graph API" )
276303 response = requests .get (url , headers = headers )
277304
278305 if response .status_code == 200 :
279306 site = response .json ()
280307 site_id = site ['id' ]
308+ logging .info (f"Resolved site ID: { site_id } " )
281309 return site_id
282310 else :
283- raise OneDriveClientException (f"Error occurred when fetching site information: "
311+ raise OneDriveClientException (f"Error occurred when fetching site information for ' { site_url } ' : "
284312 f"{ response .status_code } , { response .text } " )
285313
286314 def _get_sharepoint_document_libraries (self ):
@@ -295,12 +323,12 @@ def _get_sharepoint_document_libraries(self):
295323 raise OneDriveClientException (f"Error occurred when getting SharePoint document libraries:"
296324 f" { response .status_code } , { response .text } " )
297325
298- @backoff .on_exception (backoff .expo , Exception , max_tries = MAX_RETRIES )
326+ @backoff .on_exception (backoff .expo , OneDriveTransientException , max_tries = MAX_RETRIES )
299327 def _download_file_from_onedrive_url (self , url , output_path , filename ):
300328 """
301329 Downloads a file from OneDrive using the provided download URL and saves it to the specified output path.
302330 """
303- with self .get_request (url , is_absolute_path = True , stream = True ) as r :
331+ with self .get_raw (url , is_absolute_path = True , stream = True , ignore_auth = True ) as r :
304332
305333 if r is None :
306334 self ._handle_no_response (filename )
@@ -400,7 +428,10 @@ def get_document_libraries(self, site_url):
400428 try :
401429 response .raise_for_status ()
402430 except HTTPError as e :
403- raise OneDriveClientException (f"Cannot get document libraries for site_url: { site_url } " ) from e
431+ raise OneDriveClientException (
432+ f"Cannot get document libraries for site URL '{ site_url } ': "
433+ f"{ response .status_code } , { response .text } "
434+ ) from e
404435
405436 return response .json ()['value' ]
406437
@@ -410,6 +441,13 @@ def download_files(self, file_path, output_dir, last_modified_at=None, library_n
410441 folder_path , mask = self ._split_path_mask (file_path )
411442 logging .info (f"Downloading files matching mask { mask } from folder { folder_path } " )
412443 items = self ._get_items_based_on_client_type (folder_path , library_name )
444+ files = [i for i in items if i .get ("file" )]
445+ folders = [i for i in items if i .get ("folder" )]
446+ unknown = len (items ) - len (files ) - len (folders )
447+ breakdown = f"{ len (files )} files, { len (folders )} subfolders"
448+ if unknown :
449+ breakdown += f", { unknown } unknown"
450+ logging .info (f"Found { len (items )} items ({ breakdown } ) in '{ folder_path } '" )
413451 folder_mask = self ._create_folder_mask (mask , folder_path )
414452 self ._process_items (items , folder_mask , mask , folder_path , output_dir , last_modified_at , library_name )
415453
0 commit comments