o
    p̦i$                     @   s\   d Z ddlZddlZddlZdgZeddZG dd dZG dd dZ	G d	d
 d
Z
dS )a%   robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
    NRobotFileParserRequestRatezrequests secondsc                   @   sr   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd ZdS )r   zs This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

     c                 C   s2   g | _ g | _d | _d| _d| _| | d| _d S )NFr   )entriessitemapsdefault_entrydisallow_all	allow_allset_urllast_checkedselfurl r   )/usr/lib/python3.10/urllib/robotparser.py__init__   s   

zRobotFileParser.__init__c                 C   s   | j S )zReturns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        )r   r   r   r   r   mtime%   s   zRobotFileParser.mtimec                 C   s   ddl }|  | _dS )zYSets the time the robots.txt file was last fetched to the
        current time.

        r   N)timer   )r   r   r   r   r   modified.   s   zRobotFileParser.modifiedc                 C   s&   || _ tj|dd \| _| _dS )z,Sets the URL referring to a robots.txt file.      N)r   urllibparseurlparsehostpathr   r   r   r   r
   6   s    zRobotFileParser.set_urlc              
   C   s   z	t j| j}W n@ t jjyI } z2|jdv rd| _n|jdkr0|jdk r>d| _W Y d}~dS W Y d}~dS W Y d}~dS W Y d}~dS d}~ww |	 }| 
|d  dS )z4Reads the robots.txt URL and feeds it to the parser.)i  i  Ti  i  Nzutf-8)r   requesturlopenr   error	HTTPErrorcoder   r	   readr   decode
splitlines)r   ferrrawr   r   r   r"   ;   s   
zRobotFileParser.readc                 C   s2   d|j v r| jd u r|| _d S d S | j| d S N*)
useragentsr   r   append)r   entryr   r   r   
_add_entryH   s
   


zRobotFileParser._add_entryc                 C   sJ  d}t  }|   |D ]
}|s(|dkrt  }d}n|dkr(| | t  }d}|d}|dkr7|d| }| }|s>q|dd}t|dkr|d   |d< tj	
|d  |d< |d dkr~|dkrs| | t  }|j|d  d}q|d dkr|dkr|jt|d d	 d}q|d d
kr|dkr|jt|d d d}q|d dkr|dkr|d   rt|d |_d}q|d dkr|dkr|d d}t|dkr|d   r|d   rtt|d t|d |_d}q|d dkr| j|d  q|dkr#| | dS dS )zParse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        r   r      #N:z
user-agentdisallowFallowTzcrawl-delayzrequest-rate/sitemap)Entryr   r-   findstripsplitlenlowerr   r   unquoter*   r+   	rulelinesRuleLineisdigitintdelayr   req_rater   )r   linesstater,   lineinumbersr   r   r   r   Q   sv   





 
zRobotFileParser.parsec                 C   s   | j rdS | jr
dS | jsdS tjtj|}tjdd|j|j	|j
|jf}tj|}|s3d}| jD ]}||rD||  S q6| jrN| j|S dS )z=using the parsed robots.txt decide if useragent can fetch urlFTr   r3   )r   r	   r   r   r   r   r;   
urlunparser   paramsqueryfragmentquoter   
applies_to	allowancer   )r   	useragentr   
parsed_urlr,   r   r   r   	can_fetch   s(   

zRobotFileParser.can_fetchc                 C   >   |   sd S | jD ]}||r|j  S q	| jr| jjS d S N)r   r   rL   r@   r   r   rN   r,   r   r   r   crawl_delay      


zRobotFileParser.crawl_delayc                 C   rQ   rR   )r   r   rL   rA   r   rS   r   r   r   request_rate   rU   zRobotFileParser.request_ratec                 C   s   | j sd S | j S rR   )r   r   r   r   r   	site_maps   s   zRobotFileParser.site_mapsc                 C   s,   | j }| jd ur|| jg }dtt|S )Nz

)r   r   joinmapstr)r   r   r   r   r   __str__   s   
zRobotFileParser.__str__N)r   )__name__
__module____qualname____doc__r   r   r   r
   r"   r-   r   rP   rT   rV   rW   r[   r   r   r   r   r      s    
			I

c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	r=   zoA rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path.c                 C   s<   |dkr|sd}t jt j|}t j|| _|| _d S )Nr   T)r   r   rG   r   rK   r   rM   )r   r   rM   r   r   r   r      s
   
zRuleLine.__init__c                 C   s   | j dkp
|| j S r(   )r   
startswith)r   filenamer   r   r   rL      s   zRuleLine.applies_toc                 C   s   | j rdndd | j S )NAllowDisallowz: )rM   r   r   r   r   r   r[      s   zRuleLine.__str__N)r\   r]   r^   r_   r   rL   r[   r   r   r   r   r=      s
    r=   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )r5   z?An entry has one or more user-agents and zero or more rulelinesc                 C   s   g | _ g | _d | _d | _d S rR   )r*   r<   r@   rA   r   r   r   r   r      s   
zEntry.__init__c                 C   s   g }| j D ]
}|d|  q| jd ur|d| j  | jd ur3| j}|d|j d|j  |tt| j	 d
|S )NzUser-agent: zCrawl-delay: zRequest-rate: r3   
)r*   r+   r@   rA   requestssecondsextendrY   rZ   r<   rX   )r   retagentrater   r   r   r[      s   



zEntry.__str__c                 C   sF   | dd  }| jD ]}|dkr dS | }||v r  dS qdS )z2check if this entry applies to the specified agentr3   r   r)   TF)r8   r:   r*   )r   rN   ri   r   r   r   rL      s   
zEntry.applies_toc                 C   s$   | j D ]}||r|j  S qdS )zZPreconditions:
        - our agent applies to this entry
        - filename is URL decodedT)r<   rL   rM   )r   ra   rD   r   r   r   rM   
  s
   


zEntry.allowanceN)r\   r]   r^   r_   r   r[   rL   rM   r   r   r   r   r5      s    r5   )r_   collectionsurllib.parser   urllib.request__all__
namedtupler   r   r=   r5   r   r   r   r   <module>   s     B