
    iF6                         d dl mZmZmZmZ ddlmZmZmZm	Z	m
Z
mZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ d dlZd	ed
efdZdeee                  d
ee	         fdZdeeef         d
eeef         fdZded	ed
efdZ	 d dddededee         dee          d
ef
dZ!dddededee          d
efdZ"	 d dddededee	         dee         dee          d
ee	         fdZ#deded
e$fdZ%ded	e
d
efdZ&deded
efdZ'ded
efdZ(dS )!    )OptionalDictAnyList   )CrawlRequestCrawlJobCrawlResponseDocumentCrawlParamsRequestCrawlParamsDataWebhookConfigCrawlErrorsResponseActiveCrawlsResponseActiveCrawlPaginationConfig)handle_response_error)prepare_scrape_options)AsyncHttpClient)normalize_document_inputNrequestreturnc                 ^   | j         r| j                                         st          d          d| j         i}| j        r
| j        |d<   | j        t          | j                  }|r||d<   | j        Ct          | j        t                    r| j        |d<   n| j        	                    d          |d<   | 	                    dd          }|
                    dd            |
                    dd            |
                    d	d            d
ddddddddddddd}|                                D ]!\  }}||v r|
                    |          ||<   "|                    |           t          | dd           2t          t          | d                                                    |d<   |S )NURL cannot be emptyurlpromptscrapeOptionswebhookT)exclude_none)r   exclude_unsetscrape_optionsincludePathsexcludePathsmaxDiscoveryDepthsitemapignoreQueryParametersdeduplicateSimilarURLscrawlEntireDomainallowExternalLinksallowSubdomainsdelaymaxConcurrencyregexOnFullURLzeroDataRetention)include_pathsexclude_pathsmax_discovery_depthr%   ignore_query_parametersdeduplicate_similar_urlscrawl_entire_domainallow_external_linksallow_subdomainsr+   max_concurrencyregex_on_full_urlzero_data_retentionintegration)r   strip
ValueErrorr   r!   r   r   
isinstancestr
model_dumppopitemsupdategetattr)r   dataoptsrequest_datafield_mappingssnakecamels          `/root/.hermes/hermes-agent/venv/lib64/python3.11/site-packages/firecrawl/v2/methods/aio/crawl.py_prepare_crawl_requestrK      s   ; 0gk//11 0.///7;D~ ( X)%g&<== 	)$(D!"gos++ 	L%oDOO%o88d8KKDO%%4t%LLLUD!!!Xt$$$%t,,,''2#:$<2 4-+-2 N ',,.. 2 2uL  &**511DKKKwt,,8!''="A"ABBHHJJ]K    	data_listc                     g }| pg D ]E}t          |t                    r.t          |          }|                    t	          di |           F|S )N )r=   dictr   appendr   )rM   	documentsdoc_data
normalizeds       rJ   _parse_crawl_documentsrU   B   sb     "IO 5 5h%% 	51(;;JX33
33444rL   bodyc                    |                      d          s#t          |                      dd                    |                      d          |                      dd          |                      dd          |                      dd          |                      d	          |                      d
          t          |                      dg                     dS )NsuccesserrorUnknown error occurredstatus	completedr   totalcreditsUsed	expiresAtnextrD   r[   r\   r]   credits_used
expires_atr`   rD   )get	ExceptionrU   )rV   s    rJ   _parse_crawl_status_responserf   K   s    88I E*BCCDDD ((8$$XXk1--'1%%22hh{++  &txx';';<<  rL   clientc                   K   t          |          }|                     d|           d{V }|j        dk    rt          |d           |                                }|                    d          r7t          |                    d          |                    d                    S t          |                    d	d
                    )aV  
    Start a crawl job for a website.
    
    Args:
        client: Async HTTP client instance
        request: CrawlRequest containing URL and options
        
    Returns:
        CrawlResponse with job information
        
    Raises:
        ValueError: If request is invalid
        Exception: If the crawl operation fails to start
    z	/v2/crawlN  zstart crawlrX   idr   )rj   r   rY   rZ   )rK   poststatus_coder   jsonrd   r
   re   )rg   r   payloadresponserV   s        rJ   start_crawlrp   Z   s       %W--G[[g66666666Hs""h666==??Dxx	 EDHHUOODDDD
DHHW&>??
@
@@rL   request_timeoutjob_idpagination_configrr   c          	        K   |                      d| |           d{V }|j        dk    rt          |d           |                                }t	          |          }|d         }|r|j        nd}|r(|d         r t          | |d         |||	           d{V }t          |d
         |d         |d         |d         |d         |s|d         nd|          S )a[  
    Get the status of a crawl job.
    
    Args:
        client: Async HTTP client instance
        job_id: ID of the crawl job
        pagination_config: Optional configuration for pagination limits
        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination 
            is enabled (default) and there are multiple pages of results, this timeout applies to 
            each page request separately, not to the entire operation
        
    Returns:
        CrawlJob with job information
        
    Raises:
        Exception: If the status check fails
    
/v2/crawl/timeoutNri   zget crawl statusrD   Tr`   rq   r[   r\   r]   rb   rc   ra   )rd   rl   r   rm   rf   auto_paginate_fetch_all_pages_asyncr	   )	rg   rs   rt   rr   ro   rV   rn   rR   ry   s	            rJ   get_crawl_statusr{   s   s<     0 ZZ 5V 5 5ZOOOOOOOOHs""h(:;;;==??D*400GI 8IR%33dM 
 
0FO+
 
 
 
 
 
 
 
 
	 x +&g^,<($1;WV__t   rL   next_urlc          
      <  K   |                      ||           d{V }|j        dk    rt          |d           |                                }t	          |          }t          |d         |d         |d         |d         |d	         |d
         |d                   S )a  
    Fetch a single page of crawl results using the provided next URL.

    Args:
        client: Async HTTP client instance
        next_url: Opaque next URL from a prior crawl status response
        request_timeout: Timeout (in seconds) for the HTTP request

    Returns:
        CrawlJob with the page data and next URL (if any)

    Raises:
        Exception: If the request fails or returns an error response
    rw   Nri   zget crawl status pager[   r\   r]   rb   rc   r`   rD   ra   )rd   rl   r   rm   rf   r	   )rg   r|   rr   ro   rV   rn   s         rJ   get_crawl_status_pager~      s      ( ZZ/ZBBBBBBBBHs""h(?@@@==??D*400Gx +&g^,<(V_V_   rL   initial_documentsc                  K   |                                 }|}d}|r|j        nd}|r|j        nd}	|r|j        nd}
t	          j                    }|r|||k    rn|
t	          j                    |z
  |
k    rn|                     ||           d{V }|j        dk    r8ddl}|	                    d          }|
                    dd|j        i           n|                                }	 t          |          }n# t          $ r Y naw xY w|d	         D ].}|	t          |          |	k    r n|                    |           /|	t          |          |	k    rn|d
         }|dz  }||S )a  
    Fetch all pages of crawl results asynchronously.
    
    Args:
        client: Async HTTP client instance
        next_url: URL for the next page
        initial_documents: Documents from the first page
        pagination_config: Optional configuration for pagination limits
        request_timeout: Optional timeout (in seconds) for the underlying HTTP request
        
    Returns:
        List of all documents from all pages
    r   Nrw   ri   	firecrawlzFailed to fetch next pagerl   )extrarD   r`      )copy	max_pagesmax_resultsmax_wait_timetime	monotonicrd   rl   logging	getLoggerwarningrm   rf   re   lenrQ   )rg   r|   r   rt   rr   rR   current_url
page_countr   r   r   
start_timero   r   logger	page_datapage_payloaddocuments                     rJ   rz   rz      s     * "&&((IKJ 0AJ!++dI3DN#//$K7HR%33dM!!J
 %!zY'>'>%DN,<,<z,I]+Z+Z  KIIIIIIII3&&NNN&&{33FNN6}hNb>cNdddMMOO		7	BBLL 	 	 	E	 %V, 	' 	'H'c)nn.K.KX&&&& ##i..K*G*G #6*a
K  %N s   .C> >
D
Dc                    K   |                      d|            d{V }|j        dk    rt          |d           |                                }|                    d          dk    S )a	  
    Cancel a crawl job.
    
    Args:
        client: Async HTTP client instance
        job_id: ID of the crawl job
        
    Returns:
        True if cancellation was successful
        
    Raises:
        Exception: If the cancellation operation fails
    rv   Nri   zcancel crawlr[   	cancelled)deleterl   r   rm   rd   )rg   rs   ro   rV   s       rJ   cancel_crawlr     sv       ]]#8#8#899999999Hs""h777==??D88H,,rL   c                   K   |j         r|j                                         st          d          |j        r|j                                        st          d          |j         |j        d}|                     d|           d{V }|j        dk    rt          |d           |                                }|                    d          s#t          |                    d	d
                    |                    di           }i }ddddddddddddd}|
                                D ]\  }}	||v r||         ||	<   d|v r|d         }
|
|d<   d|v r|d         |d<   t          di |S )au  
    Preview crawl parameters before starting a crawl job.
    
    Args:
        client: Async HTTP client instance
        request: CrawlParamsRequest containing URL and prompt
        
    Returns:
        CrawlParamsData containing crawl configuration
        
    Raises:
        ValueError: If request is invalid
        Exception: If the parameter preview fails
    r   zPrompt cannot be empty)r   r   z/v2/crawl/params-previewNri   zcrawl params previewrX   rY   rZ   rD   r/   r0   r1   r%   r2   r3   r4   r5   r6   r7   r!   r9   )r"   r#   r$   r%   r&   r'   r(   r)   r*   r,   r   r.   r   r   rO   )r   r;   r<   r   rk   rl   r   rm   rd   re   rA   r   )rg   r   rn   ro   rV   params_data	convertedmappingrI   rH   wks              rJ   crawl_params_previewr   ,  s      ; 0gk//11 0.///> 3!5!5!7!7 31222kW^<<G[[!;WEEEEEEEEHs""h(>???==??D88I E*BCCDDD((62&&K "I''2!:"<24-+)2 G   2 2uK*51IeK#!	)D#I	)''Y'''rL   crawl_idc                 h  K   |                      d| d           d{V }|j        dk    rt          |d           |                                }|                     d|          }|                     dg           |                     d|                     d	g                     d
}t	          di |S )a"  
    Get errors from a crawl job.
    
    Args:
        client: Async HTTP client instance
        crawl_id: ID of the crawl job
        
    Returns:
        CrawlErrorsResponse with errors and robots blocked
        
    Raises:
        Exception: If the error check operation fails
    rv   z/errorsNri   zcheck crawl errorsrD   errorsrobotsBlockedrobots_blocked)r   r   rO   )rd   rl   r   rm   r   )rg   r   ro   rV   rn   rT   s         rJ   get_crawl_errorsr   a  s       ZZ >X > > >????????Hs""h(<=====??Dhhvt$$G++h++!++ow{{CSUW7X7XYY J ,,,,,rL   c           
        K   |                      d           d{V }|j        dk    rt          |d           |                                }|                     d          s#t	          |                     dd                    |                     dg           }g }|D ]}t          |t                    rz|                    |                     d	          |                     d
|                     d                    |                     d          |                     d          d           t          dd |D                       S )z
    Get active crawl jobs.
    
    Args:
        client: Async HTTP client instance
        
    Returns:
        ActiveCrawlsResponse with active crawl jobs
        
    Raises:
        Exception: If the active crawl jobs operation fails
    z/v2/crawl/activeNri   zget active crawlsrX   rY   rZ   crawlsrj   teamIdteam_idr   options)rj   r   r   r   Tc                 &    g | ]}t          d i |S )rO   )r   ).0ncs     rJ   
<listcomp>z%get_active_crawls.<locals>.<listcomp>  s&    5]5]5]Bk6G6GB6G6G5]5]5]rL   )rX   r   )	rd   rl   r   rm   re   r=   rP   rQ   r   )rg   ro   rV   	crawls_inrT   cs         rJ   get_active_crawlsr   {  sQ      ZZ 233333333Hs""h(;<<<==??D88I E*BCCDDD2&&IJ  a 	eeDkk55155+;+;<<uuU||55++	      5]5]R\5]5]5]^^^^rL   )N))typingr   r   r   r   typesr   r	   r
   r   r   r   r   r   r   r   r   utils.error_handlerr   utils.validationr   utils.http_client_asyncr   utils.normalizer   r   rP   rK   rU   r>   rf   rp   floatr{   r~   rz   boolr   r   r   r   rO   rL   rJ   <module>r      s   , , , , , , , , , , , ,                          9 8 8 8 8 8 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 )L )T ) ) ) )XhtCy&9 d8n    tCH~ $sCx.    Ao A A A A A A8 593
 (,3 3 333   013
 e_3 3 3 3 3t (,	! ! !!! e_	!
 ! ! ! !P 59	G (,G G GGG H~G   01	G e_G 
(^G G G GT- - - - - - -*2( 2(AS 2(Xg 2( 2( 2( 2(j-? -c -FY - - - -4_O _8L _ _ _ _ _ _rL   