o
    p̦iG                     @   s   d Z ddlZddlZddlmZ dgZedZedZedZ	edZ
ed	Zed
ZedZedZedZedZedejZedZedZG dd dejZdS )zA parser for HTML and XHTML.    N)unescape
HTMLParserz[&<]z
&[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z	<[a-zA-Z]z
</[a-zA-Z]>z--\s*>z+([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF  
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
z#</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>c                   @   s   e Zd ZdZdZddddZdd Zd	d
 Zdd ZdZ	dd Z
dd Zdd Zdd Zdd Zd7ddZdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 ZdS )8r   aE  Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    )scriptstyleT)convert_charrefsc                C   s   || _ |   dS )zInitialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        N)r   reset)selfr    r
   "/usr/lib/python3.10/html/parser.py__init__W   s   zHTMLParser.__init__c                 C   s(   d| _ d| _t| _d| _tj|  dS )z1Reset this instance.  Loses all unprocessed data. z???N)rawdatalasttaginteresting_normalinteresting
cdata_elem_markupbase
ParserBaser   r	   r
   r
   r   r   `   s
   zHTMLParser.resetc                 C   s   | j | | _ | d dS )zFeed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        r   N)r   goaheadr	   datar
   r
   r   feedh   s   zHTMLParser.feedc                 C   s   |  d dS )zHandle any buffered data.   N)r   r   r
   r
   r   closeq   s   zHTMLParser.closeNc                 C   s   | j S )z)Return full source of start tag: '<...>'.)_HTMLParser__starttag_textr   r
   r
   r   get_starttag_textw   s   zHTMLParser.get_starttag_textc                 C   s$   |  | _td| j tj| _d S )Nz</\s*%s\s*>)lowerr   recompileIr   )r	   elemr
   r
   r   set_cdata_mode{   s   
zHTMLParser.set_cdata_modec                 C   s   t | _d | _d S N)r   r   r   r   r
   r
   r   clear_cdata_mode   s   
zHTMLParser.clear_cdata_modec                 C   s  | j }d}t|}||k rU| jr;| js;|d|}|dk r:|dt||d }|dkr8td	||s8n|}n| j
	||}|rI| }n| jrNn|}||k ro| jrf| jsf| t|||  n	| |||  | ||}||kr{n|j}|d|rt||r| |}	n@|d|r| |}	n5|d|r| |}	n*|d|r| |}	n|d	|r| |}	n|d
 |k s|r| d |d
 }	nn|	dk r|sِn|t||rn|d|r|d |kr| d nt||rn| ||d d   n~|d|r0|}dD ]}
||
|d r"|t|
8 } nq| ||d |  nS|d|rB| ||d d   nA|||d   dkr[| ||d d   n(|d	|rm| ||d d   n|d|r| ||d d   ntd|}	| ||	}n|d|rt||}|r|  dd }| !| |" }	|d|	d
 s|	d
 }	| ||	}q	d||d  v r| |||d   | ||d }ny|d|rMt#||}|r| d
}| $| |" }	|d|	d
 s|	d
 }	| ||	}q	t%||}|r7|r6|  ||d  kr6|" }	|	|kr.|}	| ||d
 }n|d
 |k rL| d | ||d
 }nnJ d||k s|r||k r| js| jru| jsu| t|||  n	| |||  | ||}||d  | _ d S )Nr   <&"   z[\s;]</<!--<?<!r      )z--!z---   z	<![CDATA[   	   	<!doctypewe should not get here!z&#;zinteresting.search() lied)&r   lenr   r   findrfindmaxr   r    searchr   starthandle_datar   	updatepos
startswithstarttagopenmatchparse_starttagparse_endtagparse_commentparse_piparse_html_declaration
endtagopenhandle_commentendswithunknown_declr   handle_decl	handle_piAssertionErrorcharrefgrouphandle_charrefend	entityrefhandle_entityref
incomplete)r	   rP   r   injampposr@   r>   ksuffixnamer
   r
   r   r      s   













}zHTMLParser.goaheadc                 C   s   | j }|||d  dksJ d|||d  dkr | |S |||d  dkr/| |S |||d   d	krX|d
|d }|dkrIdS | ||d |  |d S | |S )Nr-   r,   z+unexpected call to parse_html_declaration()r/   r*   r0   z<![r1   r2   r   r4   r   )r   rC   parse_marked_sectionr   r7   rJ   parse_bogus_comment)r	   rT   r   gtposr
   r
   r   rE     s   


z!HTMLParser.parse_html_declarationr   c                 C   s`   | j }|||d  dv sJ d|d|d }|dkrdS |r,| ||d |  |d S )Nr-   )r,   r)   z"unexpected call to parse_comment()r   r4   r   )r   r7   rG   )r	   rT   reportr   posr
   r
   r   r\   '  s   zHTMLParser.parse_bogus_commentc                 C   sd   | j }|||d  dksJ dt||d }|sdS | }| ||d |  | }|S )Nr-   r+   zunexpected call to parse_pi()r4   )r   picloser:   r;   rK   rP   )r	   rT   r   r@   rV   r
   r
   r   rD   3  s   zHTMLParser.parse_pic                 C   s  d | _ | |}|dk r|S | j}||| | _ g }t||d }|s(J d| }|d  | _}||k rt	||}|sCnS|ddd\}	}
}|
sRd }n-|d d d  krd|dd  ksyn |d d d  krw|dd  krn n|dd }|rt
|}||	 |f | }||k s:|||  }|d	vr|  \}}d
| j v r|| j d
 }t| j | j d
 }n|t| j  }| |||  |S |dr| || |S | || || jv r| | |S )Nr   r   z#unexpected call to parse_starttag()r-   r0   'r4   ")r   />
rc   )r   check_for_whole_start_tagr   tagfind_tolerantr@   rP   rN   r   r   attrfind_tolerantr   appendstripgetposcountr6   r8   r<   rH   handle_startendtaghandle_starttagCDATA_CONTENT_ELEMENTSr#   )r	   rT   endposr   attrsr@   rX   tagmattrnamerest	attrvaluerP   linenooffsetr
   r
   r   rA   ?  sX   
&(




zHTMLParser.parse_starttagc                 C   s   | j }t||}|rU| }|||d  }|dkr|d S |dkr?|d|r-|d S |d|r5dS ||kr;|S |d S |dkrEdS |dv rKdS ||krQ|S |d S td	)
Nr   r   /rc   r-   r4   r   z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZr3   )r   locatestarttagend_tolerantr@   rP   r>   rL   )r	   rT   r   rr   rV   nextr
   r
   r   re   r  s.   z$HTMLParser.check_for_whole_start_tagc                 C   s*  | j }|||d  dksJ dt||d }|sdS | }t||}|sn| jd ur9| |||  |S t||d }|sV|||d  dkrQ|d S | 	|S |
d }|d| }| | |d S |
d }| jd ur|| jkr| |||  |S | | |   |S )	Nr-   r)   zunexpected call to parse_endtagr   r4   r0   z</>r   )r   	endendtagr:   rP   
endtagfindr@   r   r<   rf   r\   rN   r   r7   handle_endtagr%   )r	   rT   r   r@   r]   	namematchtagnamer"   r
   r
   r   rB     s8   





zHTMLParser.parse_endtagc                 C   s   |  || | | d S r$   )rm   r}   r	   rq   rp   r
   r
   r   rl     s   zHTMLParser.handle_startendtagc                 C      d S r$   r
   r   r
   r
   r   rm        zHTMLParser.handle_starttagc                 C   r   r$   r
   )r	   rq   r
   r
   r   r}     r   zHTMLParser.handle_endtagc                 C   r   r$   r
   r	   rZ   r
   r
   r   rO     r   zHTMLParser.handle_charrefc                 C   r   r$   r
   r   r
   r
   r   rR     r   zHTMLParser.handle_entityrefc                 C   r   r$   r
   r   r
   r
   r   r<     r   zHTMLParser.handle_datac                 C   r   r$   r
   r   r
   r
   r   rG     r   zHTMLParser.handle_commentc                 C   r   r$   r
   )r	   declr
   r
   r   rJ     r   zHTMLParser.handle_declc                 C   r   r$   r
   r   r
   r
   r   rK     r   zHTMLParser.handle_pic                 C   r   r$   r
   r   r
   r
   r   rI     r   zHTMLParser.unknown_decl)r   )__name__
__module____qualname____doc__rn   r   r   r   r   r   r   r#   r%   r   rE   r\   rD   rA   re   rB   rl   rm   r}   rO   rR   r<   rG   rJ   rK   rI   r
   r
   r
   r   r   ?   s:    		 
3"()r   r   r   htmlr   __all__r    r   rS   rQ   rM   r?   rF   r`   commentcloserf   rg   VERBOSEry   r{   r|   r   r   r
   r
   r
   r   <module>   s.    











